import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('white')
# import plotly.express as px
# from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
import datetime
# import warnings
# warnings.filterwarnings("ignore")
# from catboost import Pool, CatBoostRegressor
# import plotly.io as pio
# pio.renderers.default = 'colab'
import scipy.stats as stats
import lightgbm as lgb
import shap
# from statsmodels.regression.quantile_regression import QuantReg
# import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from bayes_opt import BayesianOptimization
from sklearn import base
from sklearn.model_selection import KFold,TimeSeriesSplit
from sklearn.linear_model import RidgeCV,Ridge
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from functools import partial
Project Overview
Build an ML model to predict the total delivery duration in seconds, defined as the time from order creation by the customer (created_at) to delivery (actual_delivery_time).
Underestimating delivery time is roughly twice as costly as overestimating it. Orders that are very early / late are also much worse than those that are only slightly early / late.
Detailed dictionary of the data is in the appendix.
Denote the residual $\epsilon_i = g(x_i) - \hat g(x_i) = y_i - \hat g(x_i)$, where $y_i$ is the observed duration and $\hat g(x_i)$ the prediction. The gradient and Hessian below are taken with respect to the prediction $\hat g(x_i)$.
Since late deliveries (positive residuals) are twice as costly as early ones, we set $\alpha = 2/3$. The custom loss function is then $L(\epsilon_i) = \alpha\,\epsilon_i^2$ if $\epsilon_i \ge 0$ and $(1-\alpha)\,\epsilon_i^2$ otherwise.
# following train API lightgbm
# def qmse(y_true, y_pred):
# assert len(y_true) == len(y_pred)
# # assert alpha<1 and alpha>0,"alpha should be in (0,1)"
# alpha=2/3
# residual = y_true - y_pred
# grad = np.where(residual<0, (alpha-1)*residual, -alpha*residual)
# hess = np.where(residual<0, 1-alpha, alpha)
# return grad, hess
# def qmse_metric(y_true, y_pred):
# # assert alpha<1 and alpha>0,"alpha should be in (0,1)"
# alpha=2/3
# residual = y_true - y_pred
# loss = np.where(residual < 0, (residual**2)*(1-alpha), alpha*residual**2)
# return "asym_mse", np.mean(loss), False
def qmse(y_pred, data):
    """Asymmetric squared-error objective for LightGBM's `train` API.

    Late deliveries (y_true > y_pred, i.e. underestimation) carry weight
    alpha = 2/3 and early ones 1 - alpha = 1/3, so underestimating is twice
    as costly. Returns the (half-scaled) gradient and hessian of the loss
    with respect to y_pred, as LightGBM's fobj signature requires.
    """
    alpha = 2 / 3
    y_true = data.get_label()
    assert len(y_true) == len(y_pred)
    residual = y_true - y_pred
    late = residual >= 0  # underestimated -> the costlier branch
    grad = np.where(late, -alpha * residual, (alpha - 1) * residual)
    hess = np.where(late, alpha, 1 - alpha)
    return grad, hess
def qmse_metric(y_pred, data):
    """Matching eval metric for `qmse`: mean asymmetric squared error.

    Returns the (name, value, is_higher_better) triple expected by
    LightGBM's feval API; lower is better.
    """
    alpha = 2 / 3
    residual = data.get_label() - y_pred
    weights = np.where(residual < 0, 1 - alpha, alpha)
    return "asym_mse", np.mean(weights * residual ** 2), False
def convert_type(df, has_actual=False):
    """Cast raw columns to appropriate dtypes and derive calendar features.

    Parameters
    ----------
    df : raw frame as read from CSV.
    has_actual : when True, also parse `actual_delivery_time` and derive the
        target `duration` (seconds from order creation to delivery).

    Returns a new frame (input is not mutated) sorted by `created_at`.
    """
    df = df.copy()
    df['created_at'] = pd.to_datetime(df['created_at'], utc=True)
    df['created_at_pst'] = df['created_at'].dt.tz_convert("America/Los_Angeles")
    # These are numeric codes, not quantities -> treat as categorical.
    for cat in ['market_id', 'store_id', 'order_protocol']:
        df[cat] = df[cat].astype('Int64').astype('category')
    df['store_primary_category'] = df['store_primary_category'].astype('category')
    df['created_day_of_week'] = (df['created_at'].dt.dayofweek + 1).astype('category')
    df['created_month'] = df['created_at'].dt.month.astype('category')
    df['created_hour'] = df['created_at'].dt.hour.astype('category')
    # Bin the local hour into 6 equal-frequency slots while it is still
    # numeric: qcut on an already-categorical column raises TypeError in
    # modern pandas (the original cast to category before binning).
    # NOTE(review): qcut edges are data-dependent, so train and submit frames
    # get slightly different slot boundaries -- confirm this is acceptable.
    hour_pst = df['created_at_pst'].dt.hour
    df['time_slot_pst'] = pd.qcut(hour_pst, 6).astype('category')
    df['created_hour_pst'] = hour_pst.astype('category')
    if has_actual:
        df['actual_delivery_time'] = pd.to_datetime(df['actual_delivery_time'], utc=True)
        df['duration'] = (df['actual_delivery_time'] - df['created_at']).dt.total_seconds()
    return df.sort_values('created_at')
# Load and normalize both datasets: the labeled history (target available)
# and the unlabeled prediction/submission set.
df = convert_type(pd.read_csv('historical_data.csv'),has_actual=True)
df_submit = convert_type(pd.read_csv('predict_data.csv'),has_actual=False)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 197428 entries, 2690 to 61787 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 market_id 196441 non-null category 1 created_at 197428 non-null datetime64[ns, UTC] 2 actual_delivery_time 197421 non-null datetime64[ns, UTC] 3 store_id 197428 non-null category 4 store_primary_category 192668 non-null category 5 order_protocol 196433 non-null category 6 total_items 197428 non-null int64 7 subtotal 197428 non-null int64 8 num_distinct_items 197428 non-null int64 9 min_item_price 197428 non-null int64 10 max_item_price 197428 non-null int64 11 total_onshift_dashers 181166 non-null float64 12 total_busy_dashers 181166 non-null float64 13 total_outstanding_orders 181166 non-null float64 14 estimated_order_place_duration 197428 non-null int64 15 estimated_store_to_consumer_driving_duration 196902 non-null float64 16 created_at_pst 197428 non-null datetime64[ns, America/Los_Angeles] 17 created_day_of_week 197428 non-null category 18 created_month 197428 non-null category 19 created_hour 197428 non-null category 20 created_hour_pst 197428 non-null category 21 time_slot_pst 197428 non-null category 22 duration 197421 non-null float64 dtypes: category(9), datetime64[ns, America/Los_Angeles](1), datetime64[ns, UTC](2), float64(5), int64(6) memory usage: 24.8 MB
df_submit.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 54778 entries, 27577 to 34166 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 market_id 54528 non-null category 1 created_at 54778 non-null datetime64[ns, UTC] 2 store_id 54778 non-null category 3 store_primary_category 53435 non-null category 4 order_protocol 54495 non-null category 5 total_items 54778 non-null int64 6 subtotal 54778 non-null int64 7 num_distinct_items 54778 non-null int64 8 min_item_price 54778 non-null int64 9 max_item_price 54778 non-null int64 10 total_onshift_dashers 50145 non-null float64 11 total_busy_dashers 50145 non-null float64 12 total_outstanding_orders 50145 non-null float64 13 estimated_order_place_duration 54778 non-null int64 14 estimated_store_to_consumer_driving_duration 54767 non-null float64 15 delivery_id 54778 non-null int64 16 platform 54778 non-null object 17 created_at_pst 54778 non-null datetime64[ns, America/Los_Angeles] 18 created_day_of_week 54778 non-null category 19 created_month 54778 non-null category 20 created_hour 54778 non-null category 21 created_hour_pst 54778 non-null category 22 time_slot_pst 54778 non-null category dtypes: category(9), datetime64[ns, America/Los_Angeles](1), datetime64[ns, UTC](1), float64(4), int64(7), object(1) memory usage: 7.0+ MB
There is one item standing out, it is created on 2014-10-19, but the others are all in 2015! This must be a data error, so I will drop it.
## Historical averages
df['created_at'].sort_values()
2690 2014-10-19 05:24:15+00:00
43519 2015-01-21 15:22:03+00:00
148754 2015-01-21 15:31:51+00:00
187014 2015-01-21 15:39:16+00:00
10265 2015-01-21 15:40:42+00:00
...
176616 2015-02-18 05:57:51+00:00
100474 2015-02-18 05:58:07+00:00
191692 2015-02-18 05:59:01+00:00
168114 2015-02-18 05:59:23+00:00
61787 2015-02-18 06:00:44+00:00
Name: created_at, Length: 197428, dtype: datetime64[ns, UTC]
df.loc[2690,:]
market_id 1 created_at 2014-10-19 05:24:15+00:00 actual_delivery_time 2015-01-25 19:11:54+00:00 store_id 3560 store_primary_category italian order_protocol 1 total_items 1 subtotal 1695 num_distinct_items 1 min_item_price 1595 max_item_price 1595 total_onshift_dashers NaN total_busy_dashers NaN total_outstanding_orders NaN estimated_order_place_duration 446 estimated_store_to_consumer_driving_duration 412.0 created_at_pst 2014-10-18 22:24:15-07:00 created_day_of_week 7 created_month 10 created_hour 5 created_hour_pst 22 time_slot_pst (19.0, 23.0] duration 8516859.0 Name: 2690, dtype: object
# Drop the lone 2014 record (data error identified above) and retire the
# now-empty October level so it no longer appears in plots/encodings.
df.drop(index=2690,inplace=True)
df['created_month']=df['created_month'].cat.remove_categories([10])
Now, check the top few records with the highest duration. There are two records taking more than one day! That's probably due to reservations (i.e., orders submitted ahead of time). I don't think they are the focus of this study, since we focus on real-time same-day delivery (I assume for simplicity). So, I will drop them as well.
df.sort_values("duration",ascending=False).head(20)[['created_at','actual_delivery_time','store_primary_category','duration','estimated_order_place_duration','estimated_store_to_consumer_driving_duration']]
| created_at | actual_delivery_time | store_primary_category | duration | estimated_order_place_duration | estimated_store_to_consumer_driving_duration | |
|---|---|---|---|---|---|---|
| 185550 | 2015-01-28 08:34:06+00:00 | 2015-02-01 16:25:25+00:00 | dessert | 373879.0 | 251 | 476.0 |
| 27189 | 2015-02-16 02:24:09+00:00 | 2015-02-19 22:45:31+00:00 | indian | 332482.0 | 251 | 767.0 |
| 83055 | 2015-02-01 02:18:07+00:00 | 2015-02-01 18:08:39+00:00 | burger | 57032.0 | 251 | 602.0 |
| 190860 | 2015-02-16 02:31:05+00:00 | 2015-02-16 17:38:32+00:00 | indian | 54447.0 | 251 | 326.0 |
| 86952 | 2015-02-05 02:11:40+00:00 | 2015-02-05 15:34:38+00:00 | thai | 48178.0 | 251 | 787.0 |
| 76743 | 2015-02-15 04:17:35+00:00 | 2015-02-15 16:59:00+00:00 | pizza | 45685.0 | 446 | 540.0 |
| 105825 | 2015-02-08 04:07:51+00:00 | 2015-02-08 15:03:43+00:00 | alcohol | 39352.0 | 251 | 315.0 |
| 66787 | 2015-02-04 20:21:30+00:00 | 2015-02-05 07:02:27+00:00 | italian | 38457.0 | 446 | 648.0 |
| 175971 | 2015-02-12 20:25:17+00:00 | 2015-02-13 07:01:00+00:00 | mexican | 38143.0 | 446 | 594.0 |
| 139989 | 2015-01-21 20:56:51+00:00 | 2015-01-22 07:00:07+00:00 | mediterranean | 36196.0 | 251 | 684.0 |
| 51228 | 2015-02-02 21:53:08+00:00 | 2015-02-03 07:01:57+00:00 | cafe | 32929.0 | 446 | 829.0 |
| 63505 | 2015-01-24 08:19:17+00:00 | 2015-01-24 17:24:07+00:00 | american | 32690.0 | 251 | 730.0 |
| 31185 | 2015-02-14 22:23:13+00:00 | 2015-02-15 07:19:12+00:00 | japanese | 32159.0 | 251 | 859.0 |
| 109875 | 2015-01-27 18:18:24+00:00 | 2015-01-28 02:10:29+00:00 | italian | 28325.0 | 446 | 869.0 |
| 39989 | 2015-01-28 22:48:43+00:00 | 2015-01-29 06:38:50+00:00 | american | 28207.0 | 446 | 533.0 |
| 171404 | 2015-01-28 22:41:45+00:00 | 2015-01-29 06:13:08+00:00 | salad | 27083.0 | 251 | 246.0 |
| 29715 | 2015-02-14 21:08:06+00:00 | 2015-02-15 04:14:44+00:00 | mexican | 25598.0 | 251 | 507.0 |
| 103937 | 2015-01-25 01:00:54+00:00 | 2015-01-25 07:55:35+00:00 | other | 24881.0 | 251 | 798.0 |
| 79473 | 2015-02-15 20:28:16+00:00 | 2015-02-16 03:08:11+00:00 | catering | 23995.0 | 446 | 576.0 |
| 182592 | 2015-02-12 00:36:10+00:00 | 2015-02-12 06:58:02+00:00 | smoothie | 22912.0 | 251 | 916.0 |
df.drop(index=[185550,27189],inplace=True)
Strangely, there are 7 records with missing delivery time. Sometimes, it may be due to censoring issue that the most recent orders' delivery time is not observed yet. However, this is not the case here, since these 7 orders span across different days (it is hard to imagine it takes more than 1 day to deliver!).
As a result, this may just be a data issue and can be ignored.
df[df.actual_delivery_time.isna()].T
| 158967 | 170416 | 7670 | 109 | 78511 | 140635 | 115982 | |
|---|---|---|---|---|---|---|---|
| market_id | 2 | 5 | 2 | 3 | 4 | 2 | 4 |
| created_at | 2015-02-01 01:21:29+00:00 | 2015-02-01 01:36:33+00:00 | 2015-02-08 02:54:42+00:00 | 2015-02-10 21:51:54+00:00 | 2015-02-15 02:15:45+00:00 | 2015-02-15 02:21:42+00:00 | 2015-02-16 01:52:49+00:00 |
| actual_delivery_time | NaT | NaT | NaT | NaT | NaT | NaT | NaT |
| store_id | 314 | 2651 | 2340 | 1698 | 901 | 1661 | 1107 |
| store_primary_category | mexican | fast | japanese | sandwich | catering | dessert | pizza |
| order_protocol | 5 | 4 | 2 | 3 | 1 | 1 | 3 |
| total_items | 5 | 3 | 4 | 1 | 9 | 3 | 2 |
| subtotal | 3447 | 982 | 2860 | 1125 | 5050 | 4210 | 2094 |
| num_distinct_items | 3 | 3 | 3 | 1 | 6 | 3 | 2 |
| min_item_price | 225 | 165 | 390 | 975 | 375 | 865 | 599 |
| max_item_price | 1349 | 575 | 690 | 975 | 1125 | 1850 | 1195 |
| total_onshift_dashers | 90.0 | 41.0 | 131.0 | 7.0 | 91.0 | 123.0 | 53.0 |
| total_busy_dashers | 88.0 | 31.0 | 123.0 | 5.0 | 75.0 | 91.0 | 53.0 |
| total_outstanding_orders | 109.0 | 31.0 | 197.0 | 4.0 | 167.0 | 176.0 | 102.0 |
| estimated_order_place_duration | 251 | 251 | 251 | 251 | 446 | 446 | 251 |
| estimated_store_to_consumer_driving_duration | 572.0 | 333.0 | 723.0 | 488.0 | 770.0 | 862.0 | 433.0 |
| created_at_pst | 2015-01-31 17:21:29-08:00 | 2015-01-31 17:36:33-08:00 | 2015-02-07 18:54:42-08:00 | 2015-02-10 13:51:54-08:00 | 2015-02-14 18:15:45-08:00 | 2015-02-14 18:21:42-08:00 | 2015-02-15 17:52:49-08:00 |
| created_day_of_week | 7 | 7 | 7 | 2 | 7 | 7 | 1 |
| created_month | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
| created_hour | 1 | 1 | 2 | 21 | 2 | 2 | 1 |
| created_hour_pst | 17 | 17 | 18 | 13 | 18 | 18 | 17 |
| time_slot_pst | (15.0, 17.0] | (15.0, 17.0] | (17.0, 18.0] | (12.0, 15.0] | (17.0, 18.0] | (17.0, 18.0] | (15.0, 17.0] |
| duration | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# Drop the 7 rows with missing actual_delivery_time (hence NaN duration);
# copy() makes df a standalone frame rather than a view of the filter.
df = df[df.duration.notna()].copy()
The other records look reasonable.
There are negative values in 'min_item_price', 'total_onshift_dashers', 'total_busy_dashers', and 'total_outstanding_orders', which doesn't make sense. Since they are few, I set them to NaN.
# Count the physically-impossible negative values per column before nulling them.
for v in ['min_item_price','total_onshift_dashers','total_busy_dashers', 'total_outstanding_orders']:
    print(f"There are {df[df[v]<0].shape[0]} obs with negative values of {v}.\n")
There are 13 obs with negative values of min_item_price. There are 21 obs with negative values of total_onshift_dashers. There are 21 obs with negative values of total_busy_dashers. There are 44 obs with negative values of total_outstanding_orders.
For now, I will change them to nan, but if I can, I should check if these entries are incorrect.
# Null out the handful of negative entries in both frames; worth checking
# upstream whether these entries were recorded incorrectly.
for v in ['min_item_price','total_onshift_dashers','total_busy_dashers', 'total_outstanding_orders']:
    df.loc[df[v]<0,v] = np.nan
    df_submit.loc[df_submit[v]<0,v] = np.nan
The first thing is to check if there are many missing values.
The only concerns are the high percentage (8%) of missing values for total_onshift_dashers, total_busy_dashers and total_outstanding_orders.
df.isna().mean()*100
market_id 0.499954 created_at 0.000000 actual_delivery_time 0.000000 store_id 0.000000 store_primary_category 2.411128 order_protocol 0.504007 total_items 0.000000 subtotal 0.000000 num_distinct_items 0.000000 min_item_price 0.006585 max_item_price 0.000000 total_onshift_dashers 8.247475 total_busy_dashers 8.247475 total_outstanding_orders 8.259125 estimated_order_place_duration 0.000000 estimated_store_to_consumer_driving_duration 0.266440 created_at_pst 0.000000 created_day_of_week 0.000000 created_month 0.000000 created_hour 0.000000 created_hour_pst 0.000000 time_slot_pst 0.000000 duration 0.000000 dtype: float64
df_submit.isna().mean()*100
market_id 0.456388 created_at 0.000000 store_id 0.000000 store_primary_category 2.451714 order_protocol 0.516631 total_items 0.000000 subtotal 0.000000 num_distinct_items 0.000000 min_item_price 0.007302 max_item_price 0.000000 total_onshift_dashers 8.461426 total_busy_dashers 8.476031 total_outstanding_orders 8.481507 estimated_order_place_duration 0.000000 estimated_store_to_consumer_driving_duration 0.020081 delivery_id 0.000000 platform 0.000000 created_at_pst 0.000000 created_day_of_week 0.000000 created_month 0.000000 created_hour 0.000000 created_hour_pst 0.000000 time_slot_pst 0.000000 dtype: float64
Is it because zero values are recorded as missing? Let's see:
# Check whether zeros appear in the data (i.e. missing values are genuine
# NaNs rather than zero-coded).
for x in ['total_onshift_dashers','total_busy_dashers','total_outstanding_orders']:
    print(df[x].value_counts())
0.0 3614
18.0 2924
15.0 2912
21.0 2841
19.0 2824
...
165.0 1
159.0 1
164.0 1
163.0 1
171.0 1
Name: total_onshift_dashers, Length: 168, dtype: int64
0.0 4170
10.0 3114
13.0 3052
6.0 3040
18.0 3001
...
150.0 2
152.0 2
153.0 1
154.0 1
149.0 1
Name: total_busy_dashers, Length: 154, dtype: int64
0.0 4110
9.0 2744
10.0 2705
8.0 2685
6.0 2672
...
260.0 1
265.0 1
268.0 1
273.0 1
285.0 1
Name: total_outstanding_orders, Length: 275, dtype: int64
Nope! Zeros are in the data. So, we may need to impute them later. But before that, it is crucial to determine whether the data is missing completely at random, at random, or not at random. We could infer it from the mean duration of observations with missing versus non-missing values.
# Compare the mean duration between rows where each feature is missing vs.
# present; a large, significant gap would suggest the feature is not missing
# (completely) at random, which matters for the imputation strategy.
for v in ['market_id','store_primary_category','order_protocol','min_item_price',
          'total_onshift_dashers','total_busy_dashers',
          'total_outstanding_orders','estimated_store_to_consumer_driving_duration']:
    missing_sample = df.loc[df[v].isna(),'duration']
    valid_sample = df.loc[df[v].notna(),'duration']
    print(f"The average duration is {missing_sample.mean():2.0f} when {v} is missing, and {valid_sample.mean():2.0f} otherwise.")
    # Welch's two-sample t-test (unequal variances); NaNs in duration omitted.
    t_stat, p_val = stats.ttest_ind(missing_sample, valid_sample, equal_var=False,nan_policy='omit')
    print(f"The two sample t-test of equal mean: t-stat {t_stat:2.2f} and p-value {p_val:2.2f}\n")
The average duration is 2878 when market_id is missing, and 2861 otherwise. The two sample t-test of equal mean: t-stat 0.46 and p-value 0.65 The average duration is 2923 when store_primary_category is missing, and 2860 otherwise. The two sample t-test of equal mean: t-stat 3.67 and p-value 0.00 The average duration is 2811 when order_protocol is missing, and 2862 otherwise. The two sample t-test of equal mean: t-stat -1.48 and p-value 0.14 The average duration is 2742 when min_item_price is missing, and 2862 otherwise. The two sample t-test of equal mean: t-stat -0.43 and p-value 0.67 The average duration is 2853 when total_onshift_dashers is missing, and 2862 otherwise. The two sample t-test of equal mean: t-stat -1.03 and p-value 0.30 The average duration is 2852 when total_busy_dashers is missing, and 2862 otherwise. The two sample t-test of equal mean: t-stat -1.10 and p-value 0.27 The average duration is 2853 when total_outstanding_orders is missing, and 2862 otherwise. The two sample t-test of equal mean: t-stat -1.00 and p-value 0.32 The average duration is 2671 when estimated_store_to_consumer_driving_duration is missing, and 2862 otherwise. The two sample t-test of equal mean: t-stat -4.66 and p-value 0.00
Summary
The duration seems unaffected by the missingness of 'market_id','order_protocol','min_item_price','min_item_price','total_onshift_dashers','total_busy_dashers', or 'total_outstanding_orders'.
For 'estimated_store_to_consumer_driving_duration', the difference in average duration between missing and valid samples (per this variable) is 3.2 minutes. Albeit statistically significant, it is not economically different (plus only 526 of them are missing).
Similar story for 'store_primary_category', where the difference is only 1.0 minute.
Therefore, we could regard them as either MCAR or MAR, which makes imputation easier.
def create_additional_vars(df):
    """Derive dasher-capacity features and the order date.

    Adds:
      - total_free_dasher: on-shift minus busy dashers (can be negative).
      - total_order_per_onshift_dasher: outstanding orders per on-shift dasher.
      - total_free_dasher_percent: free dashers as a fraction of on-shift.
      - create_at_date: the (UTC) calendar date of order creation.

    Works on a copy (consistent with `convert_type`; the original mutated the
    caller's frame) and returns the frame sorted by `created_at`.
    """
    df = df.copy()
    df['total_free_dasher'] = df['total_onshift_dashers'] - df['total_busy_dashers']
    df['total_order_per_onshift_dasher'] = (
        df['total_outstanding_orders'] / df['total_onshift_dashers'])
    df['total_free_dasher_percent'] = (
        df['total_free_dasher'] / df['total_onshift_dashers'])
    df['create_at_date'] = df['created_at'].dt.date
    # Division by zero on-shift dashers yields +/-inf; treat those as missing.
    # This is also a censoring issue: with zero on-shift dashers we cannot
    # observe a meaningful measure of market capacity.
    ratio_cols = ['total_order_per_onshift_dasher', 'total_free_dasher_percent']
    df[ratio_cols] = df[ratio_cols].replace([np.inf, -np.inf], np.nan)
    return df.sort_values('created_at')
# Derive the capacity features on both frames.
df = create_additional_vars(df)
df_submit = create_additional_vars(df_submit)
So mainly from 2015-01-21 to 2015-02-18, about 28 days of data. If I use previous 7 days (smooth out day-of-the-week seasonality) average delivery time, then I lose 25% of the data. Therefore, I will not use historical moving average here.
Let's see what variables do we have at this step!
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 197418 entries, 43519 to 61787 Data columns (total 27 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 market_id 196431 non-null category 1 created_at 197418 non-null datetime64[ns, UTC] 2 actual_delivery_time 197418 non-null datetime64[ns, UTC] 3 store_id 197418 non-null category 4 store_primary_category 192658 non-null category 5 order_protocol 196423 non-null category 6 total_items 197418 non-null int64 7 subtotal 197418 non-null int64 8 num_distinct_items 197418 non-null int64 9 min_item_price 197405 non-null float64 10 max_item_price 197418 non-null int64 11 total_onshift_dashers 181136 non-null float64 12 total_busy_dashers 181136 non-null float64 13 total_outstanding_orders 181113 non-null float64 14 estimated_order_place_duration 197418 non-null int64 15 estimated_store_to_consumer_driving_duration 196892 non-null float64 16 created_at_pst 197418 non-null datetime64[ns, America/Los_Angeles] 17 created_day_of_week 197418 non-null category 18 created_month 197418 non-null category 19 created_hour 197418 non-null category 20 created_hour_pst 197418 non-null category 21 time_slot_pst 197418 non-null category 22 duration 197418 non-null float64 23 total_free_dasher 181116 non-null float64 24 total_order_per_onshift_dasher 177481 non-null float64 25 total_free_dasher_percent 177505 non-null float64 26 create_at_date 197418 non-null object dtypes: category(9), datetime64[ns, America/Los_Angeles](1), datetime64[ns, UTC](2), float64(9), int64(5), object(1) memory usage: 30.8+ MB
The duration is obviously skewed with a long right-tail. After taking log, the distribution becomes more symmetric.
# Duration histogram on the raw scale: heavily right-skewed.
fig,ax = plt.subplots(figsize=(8,4),dpi=120)
sns.histplot(x='duration',data =df,ax = ax,kde=True)
ax.set_title(f'Overall Duration Distribution')
# ax.set_xscale('log')
plt.tight_layout()
# Same histogram on a log x-axis: the distribution becomes roughly symmetric.
fig,ax = plt.subplots(figsize=(8,4),dpi=120)
sns.histplot(x='duration',data =df,ax = ax,kde=True)
ax.set_title(f'Overall Duration Distribution (log scale)')
ax.set_xscale('log')
plt.tight_layout()
# Categorical features to profile (store_id excluded: far too many levels).
cat_list = ['market_id','order_protocol','created_day_of_week','created_month','created_hour','store_primary_category',
            'created_hour_pst','time_slot_pst'] # store_id too many
# Count of orders per level for each categorical feature.
fig,ax = plt.subplots(int(len(cat_list)/2),2,figsize=(12,1.5*len(cat_list)),dpi=150)
ax = ax.flatten()
for i,arg in enumerate(cat_list):
    sns.countplot(x=arg,data = df,ax = ax[i])
    ax[i].set_title(f'Count of Records by Categeory of {arg}')
    if arg == 'store_primary_category':
        # many cuisine labels -> rotate and shrink tick labels
        ax[i].tick_params(axis='x', rotation=90,labelsize=6)
plt.tight_layout()
Very few orders for protocol 6 and 7.
df.order_protocol.value_counts()
1 54722 3 53196 5 44288 2 24051 4 19353 6 794 7 19 Name: order_protocol, dtype: int64
Duration seems to be slightly affected by market_id, created_day_of_week, created_month and order_protocol.
It is higher at created_hour 8, but the latter is due to lower available dashers and low sample size.
store_id, hour, and primary category have high cardinality, I will use (cross-validated) target encoding for them.
# Duration by level of each categorical feature (outliers hidden for scale).
fig,ax = plt.subplots(int(len(cat_list)/2),2,figsize=(12,1.5*len(cat_list)),dpi=150)
ax = ax.flatten()
for i,arg in enumerate(cat_list):
    sns.boxplot(y='duration',x = arg,data =df.query("created_month!=10"),ax = ax[i],showfliers=False)
    # ax[i].tick_params(axis='x', rotation=20)
    # ax[i].set_ylim((0,df.duration.max()))
    ax[i].set_xlabel(arg)
    if arg == 'store_primary_category':
        # many cuisine labels -> rotate and shrink tick labels
        ax[i].tick_params(axis='x', rotation=90,labelsize=6)
fig.suptitle('Duration in Seconds (outliers omitted)')
plt.tight_layout()
# Distribution of free dashers by local (PST) creation hour.
fig,ax = plt.subplots(figsize=(4,3),dpi=120)
sns.boxplot(y='total_free_dasher',x = 'created_hour_pst',data =df[['total_free_dasher','created_hour_pst']],ax = ax,showfliers=False)
ax.tick_params(axis='x', rotation=20)
ax.set_xlabel('created_hour')
plt.tight_layout()
Note that the derived field total_free_dasher and total_free_dasher_percent are sometimes negative. Does this mean the dashers are "over-subscribed"?
df.describe(include='number',percentiles=[.1,.25,.5,.75,.9]).T
| count | mean | std | min | 10% | 25% | 50% | 75% | 90% | max | |
|---|---|---|---|---|---|---|---|---|---|---|
| total_items | 197418.0 | 3.196375 | 2.666567 | 1.0 | 1.000000 | 2.000000 | 3.000000 | 4.000000 | 6.000000 | 411.0 |
| subtotal | 197418.0 | 2682.325629 | 1823.109543 | 0.0 | 995.000000 | 1400.000000 | 2200.000000 | 3395.000000 | 4930.000000 | 27100.0 |
| num_distinct_items | 197418.0 | 2.670780 | 1.630266 | 1.0 | 1.000000 | 1.000000 | 2.000000 | 3.000000 | 5.000000 | 20.0 |
| min_item_price | 197405.0 | 686.267916 | 522.027355 | 0.0 | 175.000000 | 299.000000 | 595.000000 | 949.000000 | 1295.000000 | 14700.0 |
| max_item_price | 197418.0 | 1159.587094 | 558.417027 | 0.0 | 599.000000 | 800.000000 | 1095.000000 | 1395.000000 | 1795.000000 | 14700.0 |
| total_onshift_dashers | 181136.0 | 44.812373 | 34.524228 | 0.0 | 7.000000 | 17.000000 | 37.000000 | 65.000000 | 98.000000 | 171.0 |
| total_busy_dashers | 181136.0 | 41.743922 | 32.143572 | 0.0 | 6.000000 | 15.000000 | 34.000000 | 62.000000 | 90.000000 | 154.0 |
| total_outstanding_orders | 181113.0 | 58.062823 | 52.657901 | 0.0 | 7.000000 | 17.000000 | 41.000000 | 85.000000 | 139.000000 | 285.0 |
| estimated_order_place_duration | 197418.0 | 308.560131 | 90.139692 | 0.0 | 251.000000 | 251.000000 | 251.000000 | 446.000000 | 446.000000 | 2715.0 |
| estimated_store_to_consumer_driving_duration | 196892.0 | 545.356993 | 219.354817 | 0.0 | 253.000000 | 382.000000 | 544.000000 | 702.000000 | 833.000000 | 2088.0 |
| duration | 197418.0 | 2861.582323 | 1164.087414 | 101.0 | 1699.000000 | 2104.000000 | 2660.000000 | 3381.000000 | 4235.000000 | 57032.0 |
| total_free_dasher | 181116.0 | 3.068718 | 11.421770 | -64.0 | -4.000000 | 0.000000 | 1.000000 | 6.000000 | 14.000000 | 86.0 |
| total_order_per_onshift_dasher | 177481.0 | 1.219184 | 0.469606 | 0.0 | 0.705882 | 0.941176 | 1.200000 | 1.478873 | 1.740741 | 47.0 |
| total_free_dasher_percent | 177505.0 | 0.049558 | 0.401570 | -30.0 | -0.150000 | 0.000000 | 0.037736 | 0.173077 | 0.339623 | 1.0 |
num_list = df.select_dtypes(include=np.number).drop('duration',axis=1).columns.values
These variables seem very important:
# For each numeric feature: scatter vs. log(duration) plus its marginal histogram.
fig,ax = plt.subplots(len(num_list),2,figsize=(10,3*len(num_list)),dpi=120)
for i,arg in enumerate(num_list):
    df_plot = df[['duration', arg]].dropna()
    if arg == 'total_free_dasher_percent':
        # trim the extreme left tail so the scatter stays readable
        df_plot = df_plot.query('total_free_dasher_percent>-.15')
    df_plot['log_duration'] = np.log(df_plot['duration'])
    sns.scatterplot(x=arg,y='log_duration',data = df_plot,ax = ax[i,0],s=2)
    sns.histplot(df_plot[arg].dropna(),ax = ax[i,1],kde = False)
plt.tight_layout()
There are two groups of highly correlated features (correlation > 0.6):
To avoid singular matrix inversion in linear regression, I will therefore use Ridge regression later to improve numerical stability.
# Pairwise correlation heatmap of the numeric features plus the target.
fig,ax = plt.subplots(figsize = (13,13),dpi = 120)
cmap = sns.cubehelix_palette(light=1, as_cmap=True)
sns.heatmap(df[num_list.tolist() + ['duration']].dropna().corr(),
            cmap = cmap, ax=ax, annot=True, linewidth=0.1,square = True);
I will use the following list to keep track of the variables
# Working list of candidate model inputs (the PST helper columns are excluded).
candidates = df.drop(['created_hour_pst','created_at_pst','time_slot_pst'],axis=1).columns.values.tolist()
candidates
['market_id', 'created_at', 'actual_delivery_time', 'store_id', 'store_primary_category', 'order_protocol', 'total_items', 'subtotal', 'num_distinct_items', 'min_item_price', 'max_item_price', 'total_onshift_dashers', 'total_busy_dashers', 'total_outstanding_orders', 'estimated_order_place_duration', 'estimated_store_to_consumer_driving_duration', 'created_day_of_week', 'created_month', 'created_hour', 'duration', 'total_free_dasher', 'total_order_per_onshift_dasher', 'total_free_dasher_percent', 'create_at_date']
The target encoding code is adapted from this link. Note that I don't want to use TimeSeriesSplit for target encoding, because the latest time-series fold is never used, and the earlier ones are used multiple times.
class KFoldTargetEncoderTrain(base.BaseEstimator, base.TransformerMixin):
    """Out-of-fold target (mean) encoding for one high-cardinality column.

    Each row's encoding is the target mean of its category computed on the
    OTHER folds, so the encoded feature never leaks the row's own target.
    Categories unseen in the training folds fall back to the global target
    mean.
    """

    def __init__(self, colnames, targetName, cv_fold=None,
                 verbosity=True, discardOriginal_col=False):
        # cv_fold=None avoids the mutable-default pitfall: the original
        # signature default (`KFold(n_splits=5, shuffle=True)`) created one
        # splitter instance shared across every encoder instance.
        self.colnames = colnames
        self.targetName = targetName
        self.cv_fold = cv_fold
        self.verbosity = verbosity
        self.discardOriginal_col = discardOriginal_col

    def fit(self, X, y=None):
        # Stateless: all work happens in transform on the training frame.
        return self

    def transform(self, X):
        assert isinstance(self.targetName, str)
        assert isinstance(self.colnames, str)
        assert self.colnames in X.columns
        assert self.targetName in X.columns
        X = X.copy()  # do not mutate the caller's frame
        cv = self.cv_fold if self.cv_fold is not None else KFold(n_splits=5, shuffle=True)
        mean_of_target = X[self.targetName].mean()
        col_mean_name = self.colnames + '_' + 'encoded'
        X[col_mean_name] = np.nan
        for tr_ind, val_ind in cv.split(X):
            X_tr, X_val = X.iloc[tr_ind], X.iloc[val_ind]
            # Map each validation-fold category to its training-fold target mean.
            X.loc[X.index[val_ind], col_mean_name] = X_val[self.colnames]\
                .map(X_tr.groupby(self.colnames)[self.targetName].mean()).astype(float)
        # Unseen categories -> global mean (inplace fillna is deprecated in pandas 2+).
        X[col_mean_name] = X[col_mean_name].fillna(mean_of_target)
        if self.verbosity:
            print('Correlation between the new feature, {} and, {} is {}.'.format(
                col_mean_name, self.targetName,
                X[[self.targetName, col_mean_name]].corr().values[0][1]))
        if self.discardOriginal_col:
            # NOTE(review): drops the *target* column, matching the original
            # code, although the flag name suggests dropping `colnames` --
            # confirm the intended semantics with callers.
            X = X.drop(self.targetName, axis=1)
        return X
class KFoldTargetEncoderTest(base.BaseEstimator, base.TransformerMixin):
    """Apply a target encoding learned on the training frame to new data.

    `train` must already contain `encodedName` (as produced by
    KFoldTargetEncoderTrain). Test rows are mapped to the per-category mean
    of that encoded column; unseen categories fall back to its overall mean.
    """

    def __init__(self, train, colNames, encodedName):
        self.train = train
        self.colNames = colNames
        self.encodedName = encodedName

    def fit(self, X, y=None):
        # Stateless: the lookup table is built from `train` in transform.
        return self

    def transform(self, X):
        X = X.copy()  # do not mutate the caller's frame
        mean_of_encoded_target = self.train[self.encodedName].mean()
        # Per-category mean of the (already out-of-fold) encoded feature.
        mean = self.train[[self.colNames, self.encodedName]]\
            .groupby(self.colNames).mean().iloc[:, 0].to_dict()
        X[self.encodedName] = X[self.colNames].map(mean).astype(float)
        # Unseen categories -> overall mean (inplace fillna is deprecated in pandas 2+).
        X[self.encodedName] = X[self.encodedName].fillna(mean_of_encoded_target)
        return X
There are three high-cardinality features:
I will use target encoding with CV to encode these variables.
high_card = ['store_id','store_primary_category','created_hour']
The distribution of duration seems wide, so we have some variation.
# Distribution of the per-category mean duration for each high-cardinality
# feature (log x-scale because duration is right-skewed).
fig, ax = plt.subplots(len(high_card), 1, figsize=(6, 3 * len(high_card)), dpi=120)
ax = ax.flatten()
for panel, feature in zip(ax, high_card):
    df_plot = df.groupby(feature)['duration'].mean().reset_index()
    sns.histplot(x='duration', data=df_plot, ax=panel, kde=True)
    panel.set_title(f'Duration Distribution by {feature}')
    panel.set_xscale('log')
plt.tight_layout()
Checking the # of orders per category. Using CV target encoding, if a category has low # of obs, it will just be encoded (closer to) the overall average duration.
# Orders per store: most stores have few orders, so CV target encoding will
# shrink their encodings toward the global mean.
df.groupby('store_id')['duration'].count().reset_index()\
    .rename(columns={'duration':'# of orders per store'}).describe(percentiles=[.1,.25,.5,.75,.9]).T
| count | mean | std | min | 10% | 25% | 50% | 75% | 90% | max | |
|---|---|---|---|---|---|---|---|---|---|---|
| # of orders per store | 6743.0 | 29.277473 | 56.484772 | 1.0 | 2.0 | 4.0 | 11.0 | 29.0 | 71.8 | 937.0 |
# Orders per primary category: the same shrinkage argument applies.
df.groupby('store_primary_category')['duration'].count().reset_index()\
    .rename(columns={'duration':'# of orders per primary category'}).describe(percentiles=[.1,.25,.5,.75,.9]).T
| count | mean | std | min | 10% | 25% | 50% | 75% | 90% | max | |
|---|---|---|---|---|---|---|---|---|---|---|
| # of orders per primary category | 74.0 | 2603.486486 | 4287.071441 | 1.0 | 22.6 | 81.0 | 519.5 | 2728.0 | 8351.0 | 19399.0 |
# Orders per creation hour (full table, since there are only ~19 hours present).
df.groupby('created_hour')['duration'].count().reset_index()\
    .rename(columns={'duration':'# of orders per created_hour'})
| created_hour | # of orders per created_hour | |
|---|---|---|
| 0 | 0 | 12669 |
| 1 | 1 | 28187 |
| 2 | 2 | 36972 |
| 3 | 3 | 27068 |
| 4 | 4 | 15250 |
| 5 | 5 | 7095 |
| 6 | 6 | 1416 |
| 7 | 7 | 11 |
| 8 | 8 | 1 |
| 9 | 14 | 40 |
| 10 | 15 | 538 |
| 11 | 16 | 2109 |
| 12 | 17 | 3413 |
| 13 | 18 | 5100 |
| 14 | 19 | 13541 |
| 15 | 20 | 15560 |
| 16 | 21 | 11464 |
| 17 | 22 | 8821 |
| 18 | 23 | 8163 |
# Out-of-fold target-encode each high-cardinality feature on the training set
# (sorted by creation time), then map the learned encodings onto the test set.
for var in high_card:
    targetc = KFoldTargetEncoderTrain(var,'duration')
    df = targetc.fit_transform(df.sort_values('created_at'))
    test_targetc = KFoldTargetEncoderTest(df,var,var+'_encoded')
    df_submit = test_targetc.fit_transform(df_submit)
Correlation between the new feature, store_id_encoded and, duration is 0.2963364667420029. Correlation between the new feature, store_primary_category_encoded and, duration is 0.1175805954246732. Correlation between the new feature, created_hour_encoded and, duration is 0.25377690506294087.
# Sanity check: no missing encodings in the test set.
for var in high_card:
    print(df_submit[var+'_encoded'].isna().mean())
0.0 0.0 0.0
# Histogram of the per-category encoded values for each feature.
fig,axes = plt.subplots(len(high_card),1,figsize=(6,4*len(high_card)),dpi=120)
for ax,var in zip(axes, high_card):
    df[[var,var+'_encoded']].groupby(var).mean().hist(bins=50,ax=ax,grid=False)
    ax.set_title(f"Histogram of Target-Encoded {var}")
plt.tight_layout()
# Bar chart of the average encoded value per category.
# NOTE(review): 'created_hour' appears twice in this list, so the third panel
# duplicates the second — confirm whether a different variable was intended
# (store_id has ~6.7k levels, so it would be unreadable as a bar chart).
fig,axes = plt.subplots(3,1,figsize=(8,4*3),dpi=120)
for ax,var in zip(axes, ['store_primary_category','created_hour','created_hour']):
    df[[var,var+'_encoded']].groupby(var).mean().plot.bar(ax=ax)
    ax.set_title(f"Average of Target-Encoded {var}")
plt.tight_layout()
**Feature Engineering 1**
# In the model-feature candidate list, replace the raw high-cardinality
# columns with their encoded versions.
for var in high_card:
    candidates.remove(var)
    candidates.append(var+"_encoded")
# Remaining categorical candidates (to be handled natively by lightgbm):
df[candidates].select_dtypes('category').columns.values.tolist()
['market_id', 'order_protocol', 'created_day_of_week', 'created_month']
I can use one-hot encoding, but since lightgbm handles these integer-coded variables well, I will keep them as-is.
The following numeric variables also measure market capacity, yet some of them are highly correlated with one another.
One way is to use dimension reduction to combine them (e.g., PCA). But that only considers linear combinations thereof. Here, I will use a different method:
# Capacity features to discretize into bands.
bin_vars = ['total_free_dasher','total_free_dasher_percent']
# corr coeff < 0.1
binned_vars = [var+'_band' for var in bin_vars]
# Decile table below motivates the band edges used in band_features().
df[bin_vars].describe(percentiles=[.1,.2,.3,.4,.5,.6,.7,.8,.9]).T
| count | mean | std | min | 10% | 20% | 30% | 40% | 50% | 60% | 70% | 80% | 90% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| total_free_dasher | 181116.0 | 3.068718 | 11.42177 | -64.0 | -4.00 | -1.000000 | 0.0 | 0.0 | 1.000000 | 3.000000 | 4.000000 | 8.00000 | 14.000000 | 86.0 |
| total_free_dasher_percent | 177505.0 | 0.049558 | 0.40157 | -30.0 | -0.15 | -0.027778 | 0.0 | 0.0 | 0.037736 | 0.076923 | 0.136364 | 0.21875 | 0.339623 | 1.0 |
def band_features(df):
    """Put features into different bands.

    Adds two ordered-categorical columns to ``df`` (mutating it, as before):
    'total_free_dasher_band' and 'total_free_dasher_percent_band'. A value x
    falls in the band (lo, hi] — left-open, right-closed — matching the
    original explicit threshold assignments; NaN inputs stay NaN.
    """
    # Edges/labels reproduce the original thresholds exactly.
    dasher_edges = [-np.inf, 1, 3, 5, 8, 14, np.inf]
    dasher_labels = ['leq_1', '1_to_3', '3_to_5', '5_to_8', '8_to_14', 'gt_14']
    df['total_free_dasher_band'] = pd.cut(df['total_free_dasher'],
                                          bins=dasher_edges,
                                          labels=dasher_labels)

    pct_edges = [-np.inf, 0.04, 0.08, 0.14, 0.22, 0.34, np.inf]
    pct_labels = ['leq_4pct', '4_to_8pct', '8_to_14pct',
                  '14_to_22pct', '22_to_34pct', 'gt_34pct']
    df['total_free_dasher_percent_band'] = pd.cut(df['total_free_dasher_percent'],
                                                  bins=pct_edges,
                                                  labels=pct_labels)
    return df
# Band the capacity features on both train and test frames.
df = band_features(df)
df_submit = band_features(df_submit)
# Box plots of duration per band (outliers hidden) to check for an
# ordering relationship between band and duration.
fig,ax = plt.subplots(len(binned_vars),1,figsize=(5,5*len(binned_vars)),dpi=120)
for i,arg in enumerate(binned_vars):
    sns.boxplot(y='duration',x = arg,data =df,ax = ax[i],showfliers=False)
    ax[i].tick_params(axis='x', rotation=20)
    # ax[i].set_ylim((0,df.duration.max()))
    ax[i].set_xlabel(arg)
fig.suptitle('Duration in Seconds (outliers omitted)')
plt.tight_layout()
There is some ordering relationship between these bands and duration. In this case, I will use target encoding again to help the ML model learn this feature.
# Target-encode the band features, same procedure as the high-cardinality ones.
for var in binned_vars:
    targetc = KFoldTargetEncoderTrain(var,'duration')
    df = targetc.fit_transform(df.sort_values('created_at'))
    test_targetc = KFoldTargetEncoderTest(df,var,var+'_encoded')
    df_submit = test_targetc.fit_transform(df_submit)
Correlation between the new feature, total_free_dasher_band_encoded and, duration is 0.12967696912856888. Correlation between the new feature, total_free_dasher_percent_band_encoded and, duration is 0.1741369326129191.
# Average encoded value per band — should preserve the ordering seen above.
fig,axes = plt.subplots(len(binned_vars),1,figsize=(8,4*len(binned_vars)),dpi=120)
for ax,var in zip(axes, binned_vars):
    df[[var,var+'_encoded']].groupby(var).mean().plot.bar(ax=ax)
    ax.set_title(f"Average of Target-Encoded {var}")
plt.tight_layout()
And we also solved for their missing values problem! Nice!
df[[x+'_encoded' for x in binned_vars]].isna().mean()
total_free_dasher_band_encoded 0.0 total_free_dasher_percent_band_encoded 0.0 dtype: float64
df_submit[[x+'_encoded' for x in binned_vars]].isna().mean()
total_free_dasher_band_encoded 0.0 total_free_dasher_percent_band_encoded 0.0 dtype: float64
**Feature Engineering 2**
# Replace the raw capacity features with their band-encoded versions.
for var in binned_vars:
    candidates.append(var+"_encoded")
for var in bin_vars:
    candidates.remove(var)
Market ID is an important identifier, and will affect subsequent analysis. For simplicity, I will fill it with the most frequent category. This is OK, because less than 0.5% of market IDs are missing, and there is no difference in average duration.
df[candidates].isna().mean()*100
market_id 0.499954 created_at 0.000000 actual_delivery_time 0.000000 order_protocol 0.504007 total_items 0.000000 subtotal 0.000000 num_distinct_items 0.000000 min_item_price 0.006585 max_item_price 0.000000 total_onshift_dashers 8.247475 total_busy_dashers 8.247475 total_outstanding_orders 8.259125 estimated_order_place_duration 0.000000 estimated_store_to_consumer_driving_duration 0.266440 created_day_of_week 0.000000 created_month 0.000000 duration 0.000000 total_order_per_onshift_dasher 10.098876 create_at_date 0.000000 store_id_encoded 0.000000 store_primary_category_encoded 0.000000 created_hour_encoded 0.000000 total_free_dasher_band_encoded 0.000000 total_free_dasher_percent_band_encoded 0.000000 dtype: float64
def fill_mode(df, var):
    """Return a copy of ``df`` with missing values in ``var`` filled by its mode.

    Parameters
    ----------
    df : pd.DataFrame
    var : str
        Column whose NaNs are replaced by the most frequent value.

    Raises
    ------
    ValueError
        If the column has no observed values, so no mode exists.
    """
    df = df.copy()
    modes = df[var].mode()
    if modes.empty:
        # guard: .mode() on an all-NaN column is empty and .iloc[0] would
        # otherwise raise an opaque IndexError
        raise ValueError(f"cannot fill '{var}': column has no observed values")
    # avoid chained inplace fillna (deprecated / unreliable on a column view)
    df[var] = df[var].fillna(modes.iloc[0])
    return df
# Fill missing market_id with the most frequent market on both frames.
df = fill_mode(df,'market_id')
df_submit = fill_mode(df_submit,'market_id')
The only variables with a significant percentage missing are 'total_onshift_dashers', 'total_busy_dashers', 'total_outstanding_orders' and 'total_order_per_onshift_dasher'.
The rest I will just leave for lightgbm to handle.
I will use the IterativeImputer in Scikit-Learn. However, I need to one-hot encode the remaining cat variables first.
# Columns excluded from the imputation design matrix.
drop_list=['duration','created_at','actual_delivery_time','create_at_date','created_month'] # since the test set only has one month
# Categorical columns to one-hot encode before imputation.
cats = ['market_id', 'order_protocol', 'created_day_of_week','time_slot_pst']
# Numeric columns whose missing values are to be imputed.
to_impute = ['total_busy_dashers','total_outstanding_orders','total_order_per_onshift_dasher','total_onshift_dashers',
             'estimated_store_to_consumer_driving_duration']
def impute_missing(df,cats,candidates,drop_list,to_impute,estimator=None,imputer=None):
    """
    Use an IterativeImputer to impute missing values.

    Parameters
    ----------
    df : pd.DataFrame
    cats : list of categorical var names (one-hot encoded before imputation)
    candidates : all relevant vars
    drop_list : vars to drop before passing to imputation
    to_impute : list of vars whose missing values are written back into df
    estimator : regressor used inside IterativeImputer (default: BayesianRidge)
    imputer : an already-fitted IterativeImputer; pass the one fitted on the
        training set when transforming the test set so both use the same model

    Returns
    -------
    (df, imputer) : copy of df with `to_impute` columns filled, and the
        fitted imputer for reuse.
    """
    from sklearn.linear_model import BayesianRidge
    df = df.copy()
    ## one-hot encode the categorical variables (dummy_na keeps missingness info)
    df_dummy = pd.concat([df[candidates].drop(cats+drop_list,axis=1,errors='ignore'),
                          pd.get_dummies(df[cats],drop_first = False,dummy_na=True)],axis=1)
    if imputer is None:
        if estimator is None:
            # estimator = lgb.LGBMRegressor(random_state=7)
            # bug fix: BayesianRidge(normalize=True) crashes on scikit-learn
            # >= 1.2, where the `normalize` parameter was removed.
            estimator = BayesianRidge()
        imputer = IterativeImputer(random_state=11, estimator=estimator,skip_complete=True)
        imputer.fit(df_dummy)
    X_new = imputer.transform(df_dummy)
    # write imputed values back only where the original was missing,
    # leaving observed values untouched
    for i,var in enumerate(df_dummy.columns):
        if var in to_impute:
            df[var] = np.where(df[var].isna(),X_new[:,i],df[var])
    return df,imputer
# Fit the imputer on the training set with a LightGBM estimator, then compare
# the imputed vs. original distributions.
df_imputed,imputer = impute_missing(df,cats,candidates,drop_list,to_impute,
                                    estimator=lgb.LGBMRegressor(random_state=7),
                                    imputer=None)
df_imputed[to_impute].hist();
df[to_impute].hist();
# Summary stats after imputation (counts now equal the full sample size).
df_imputed[to_impute].describe()
| total_busy_dashers | total_outstanding_orders | total_order_per_onshift_dasher | total_onshift_dashers | estimated_store_to_consumer_driving_duration | |
|---|---|---|---|---|---|
| count | 197418.000000 | 197418.000000 | 197418.000000 | 197418.000000 | 197418.000000 |
| mean | 40.750271 | 57.027429 | 1.221164 | 43.745263 | 545.342570 |
| std | 30.972641 | 50.571968 | 0.476138 | 33.264701 | 219.071562 |
| min | 0.000000 | 0.000000 | -0.045821 | 0.000000 | 0.000000 |
| 25% | 17.000000 | 19.000000 | 0.953125 | 19.000000 | 383.000000 |
| 50% | 31.000000 | 43.268230 | 1.238095 | 33.000000 | 544.000000 |
| 75% | 59.000000 | 80.000000 | 1.476190 | 62.000000 | 702.000000 |
| max | 154.000000 | 285.000000 | 47.000000 | 171.000000 | 2088.000000 |
df[to_impute].describe()
| total_busy_dashers | total_outstanding_orders | total_order_per_onshift_dasher | total_onshift_dashers | estimated_store_to_consumer_driving_duration | |
|---|---|---|---|---|---|
| count | 181136.000000 | 181113.000000 | 177481.000000 | 181136.000000 | 196892.000000 |
| mean | 41.743922 | 58.062823 | 1.219184 | 44.812373 | 545.356993 |
| std | 32.143572 | 52.657901 | 0.469606 | 34.524228 | 219.354817 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 15.000000 | 17.000000 | 0.941176 | 17.000000 | 382.000000 |
| 50% | 34.000000 | 41.000000 | 1.200000 | 37.000000 | 544.000000 |
| 75% | 62.000000 | 85.000000 | 1.478873 | 65.000000 | 702.000000 |
| max | 154.000000 | 285.000000 | 47.000000 | 171.000000 | 2088.000000 |
# Impute the test set with the imputer fitted on the training set; the test
# frame has no duration/actual_delivery_time, so those are removed from the
# candidate list.
df_submit_imputed,_ = impute_missing(df_submit,cats,
                                     candidates=list(set(candidates)-set(['actual_delivery_time', 'duration'])),
                                     drop_list=drop_list+['delivery_id','platform'],
                                     to_impute=to_impute,imputer=imputer)
# Verify the imputed columns are now complete on the training frame.
df_imputed[candidates].isna().mean()*100
market_id 0.000000 created_at 0.000000 actual_delivery_time 0.000000 order_protocol 0.504007 total_items 0.000000 subtotal 0.000000 num_distinct_items 0.000000 min_item_price 0.006585 max_item_price 0.000000 total_onshift_dashers 0.000000 total_busy_dashers 0.000000 total_outstanding_orders 0.000000 estimated_order_place_duration 0.000000 estimated_store_to_consumer_driving_duration 0.000000 created_day_of_week 0.000000 created_month 0.000000 duration 0.000000 total_order_per_onshift_dasher 0.000000 create_at_date 0.000000 store_id_encoded 0.000000 store_primary_category_encoded 0.000000 created_hour_encoded 0.000000 total_free_dasher_band_encoded 0.000000 total_free_dasher_percent_band_encoded 0.000000 dtype: float64
# Promote the imputed frames to the working frames and free the temporaries.
df = df_imputed.copy()
df_submit = df_submit_imputed.copy()
del df_imputed,df_submit_imputed
Time series data usually have high auto-correlation. Historical averages are thus very useful.
Calculate moving averages over the last week by market_id and time_slot.
# Week/month of creation in PST, used to key the weekly history grid.
# NOTE(review): Series.dt.week is deprecated (removed in pandas 2.x);
# .dt.isocalendar().week is the replacement — confirm the pinned pandas version.
df['created_week_pst'] = df['created_at_pst'].dt.week
df_submit['created_week_pst'] = df_submit['created_at_pst'].dt.week
df['created_month_pst'] = df['created_at_pst'].dt.month
df_submit['created_month_pst'] = df_submit['created_at_pst'].dt.month
# Segment by market and intra-day time slot; order the history by week.
seg_vars = ['market_id','time_slot_pst']
order_vars = ['created_week_pst']
# combine all the date time
# Weekly mean duration per (market, time-slot) segment.
df_agg = df[seg_vars+order_vars+['duration']].groupby(seg_vars+order_vars).agg(mean_duration=('duration','mean')).reset_index()
# extend one more week
# (so test-set weeks can still find a "previous week" row after the +1 shift)
df_temp = df_submit[seg_vars+order_vars].copy()
df_temp['created_week_pst'] += 1
df_all = pd.concat([df[seg_vars+order_vars],df_submit[seg_vars+order_vars],df_temp]).drop_duplicates().reset_index(drop=True)\
    .merge(df_agg,how='left',validate="1:1")
def fill_mean_duration(df_all, seg_vars=None, order_vars=None, forward_fill=True):
    """
    Fill missing `mean_duration` values in three passes:

    1. Propagate the last (or, if ``forward_fill`` is False, the next) known
       value within each segment, walking in ``order_vars`` order.
    2. Remaining gaps take the average duration of the same time slot.
    3. Anything still missing takes the average duration of the same market.

    Returns a filled copy; ``df_all`` is left untouched.
    """
    if seg_vars is None:
        seg_vars = ['market_id', 'time_slot_pst']
    if order_vars is None:
        order_vars = ['created_week_pst']
    df_filled = df_all.copy()

    # pass 1: within-segment fill along the time ordering (result aligns back
    # to df_filled by index, so the sort only affects fill direction)
    grouped = df_filled.sort_values(seg_vars + order_vars).groupby(seg_vars)['mean_duration']
    df_filled['mean_duration'] = grouped.ffill() if forward_fill else grouped.bfill()

    # pass 2: fall back to the time-slot average
    df_filled['mean_duration'] = df_filled.groupby(['time_slot_pst'])['mean_duration']\
        .transform(lambda s: s.fillna(s.mean()))

    # pass 3: fall back to the market average
    df_filled['mean_duration'] = df_filled.groupby(['market_id'])['mean_duration']\
        .transform(lambda s: s.fillna(s.mean()))
    return df_filled
# Forward-fill the weekly grid and spot-check market 1.
df_filled = fill_mean_duration(df_all,seg_vars,order_vars,forward_fill = True)
df_filled[df_filled.market_id==1].sort_values(seg_vars+order_vars)
| market_id | time_slot_pst | created_week_pst | mean_duration | |
|---|---|---|---|---|
| 0 | 1 | (-0.001, 12.0] | 4 | 2590.043385 |
| 36 | 1 | (-0.001, 12.0] | 5 | 2958.138080 |
| 72 | 1 | (-0.001, 12.0] | 6 | 2882.142513 |
| 109 | 1 | (-0.001, 12.0] | 7 | 3017.765534 |
| 144 | 1 | (-0.001, 12.0] | 8 | 3177.005725 |
| 180 | 1 | (5.999, 12.0] | 8 | 3044.662812 |
| 187 | 1 | (5.999, 12.0] | 9 | 3044.662812 |
| 223 | 1 | (5.999, 12.0] | 10 | 3044.662812 |
| 8 | 1 | (12.0, 15.0] | 4 | 2783.577386 |
| 42 | 1 | (12.0, 15.0] | 5 | 2948.539454 |
| 81 | 1 | (12.0, 15.0] | 6 | 2799.499652 |
| 114 | 1 | (12.0, 15.0] | 7 | 2845.494608 |
| 152 | 1 | (12.0, 15.0] | 8 | 2726.836478 |
| 193 | 1 | (12.0, 15.0] | 9 | 2726.836478 |
| 229 | 1 | (12.0, 15.0] | 10 | 2726.836478 |
| 14 | 1 | (15.0, 17.0] | 4 | 3045.468819 |
| 49 | 1 | (15.0, 17.0] | 5 | 3214.773333 |
| 89 | 1 | (15.0, 17.0] | 6 | 3139.322804 |
| 121 | 1 | (15.0, 17.0] | 7 | 3128.961026 |
| 160 | 1 | (15.0, 17.0] | 8 | 2865.455338 |
| 200 | 1 | (15.0, 17.0] | 9 | 2865.455338 |
| 236 | 1 | (15.0, 17.0] | 10 | 2865.455338 |
| 22 | 1 | (17.0, 18.0] | 4 | 3016.304312 |
| 54 | 1 | (17.0, 18.0] | 5 | 3622.680925 |
| 92 | 1 | (17.0, 18.0] | 6 | 3590.994481 |
| 129 | 1 | (17.0, 18.0] | 7 | 3598.500535 |
| 164 | 1 | (17.0, 18.0] | 8 | 3237.492925 |
| 206 | 1 | (17.0, 18.0] | 9 | 3237.492925 |
| 242 | 1 | (17.0, 18.0] | 10 | 3237.492925 |
| 25 | 1 | (18.0, 19.0] | 4 | 2757.426989 |
| 61 | 1 | (18.0, 19.0] | 5 | 3383.176277 |
| 96 | 1 | (18.0, 19.0] | 6 | 3622.907042 |
| 132 | 1 | (18.0, 19.0] | 7 | 3498.480090 |
| 172 | 1 | (18.0, 19.0] | 8 | 3386.102857 |
| 213 | 1 | (18.0, 19.0] | 9 | 3386.102857 |
| 249 | 1 | (18.0, 19.0] | 10 | 3386.102857 |
| 33 | 1 | (19.0, 23.0] | 4 | 2243.327497 |
| 66 | 1 | (19.0, 23.0] | 5 | 2659.729921 |
| 105 | 1 | (19.0, 23.0] | 6 | 2892.423384 |
| 141 | 1 | (19.0, 23.0] | 7 | 2782.944875 |
| 177 | 1 | (19.0, 23.0] | 8 | 2979.740351 |
| 216 | 1 | (19.0, 23.0] | 9 | 2979.740351 |
| 252 | 1 | (19.0, 23.0] | 10 | 2979.740351 |
df_all[df_filled.market_id==1].sort_values(seg_vars+order_vars)
| market_id | time_slot_pst | created_week_pst | mean_duration | |
|---|---|---|---|---|
| 0 | 1 | (-0.001, 12.0] | 4 | 2590.043385 |
| 36 | 1 | (-0.001, 12.0] | 5 | 2958.138080 |
| 72 | 1 | (-0.001, 12.0] | 6 | 2882.142513 |
| 109 | 1 | (-0.001, 12.0] | 7 | 3017.765534 |
| 144 | 1 | (-0.001, 12.0] | 8 | 3177.005725 |
| 180 | 1 | (5.999, 12.0] | 8 | NaN |
| 187 | 1 | (5.999, 12.0] | 9 | NaN |
| 223 | 1 | (5.999, 12.0] | 10 | NaN |
| 8 | 1 | (12.0, 15.0] | 4 | 2783.577386 |
| 42 | 1 | (12.0, 15.0] | 5 | 2948.539454 |
| 81 | 1 | (12.0, 15.0] | 6 | 2799.499652 |
| 114 | 1 | (12.0, 15.0] | 7 | 2845.494608 |
| 152 | 1 | (12.0, 15.0] | 8 | 2726.836478 |
| 193 | 1 | (12.0, 15.0] | 9 | NaN |
| 229 | 1 | (12.0, 15.0] | 10 | NaN |
| 14 | 1 | (15.0, 17.0] | 4 | 3045.468819 |
| 49 | 1 | (15.0, 17.0] | 5 | 3214.773333 |
| 89 | 1 | (15.0, 17.0] | 6 | 3139.322804 |
| 121 | 1 | (15.0, 17.0] | 7 | 3128.961026 |
| 160 | 1 | (15.0, 17.0] | 8 | 2865.455338 |
| 200 | 1 | (15.0, 17.0] | 9 | NaN |
| 236 | 1 | (15.0, 17.0] | 10 | NaN |
| 22 | 1 | (17.0, 18.0] | 4 | 3016.304312 |
| 54 | 1 | (17.0, 18.0] | 5 | 3622.680925 |
| 92 | 1 | (17.0, 18.0] | 6 | 3590.994481 |
| 129 | 1 | (17.0, 18.0] | 7 | 3598.500535 |
| 164 | 1 | (17.0, 18.0] | 8 | 3237.492925 |
| 206 | 1 | (17.0, 18.0] | 9 | NaN |
| 242 | 1 | (17.0, 18.0] | 10 | NaN |
| 25 | 1 | (18.0, 19.0] | 4 | 2757.426989 |
| 61 | 1 | (18.0, 19.0] | 5 | 3383.176277 |
| 96 | 1 | (18.0, 19.0] | 6 | 3622.907042 |
| 132 | 1 | (18.0, 19.0] | 7 | 3498.480090 |
| 172 | 1 | (18.0, 19.0] | 8 | 3386.102857 |
| 213 | 1 | (18.0, 19.0] | 9 | NaN |
| 249 | 1 | (18.0, 19.0] | 10 | NaN |
| 33 | 1 | (19.0, 23.0] | 4 | 2243.327497 |
| 66 | 1 | (19.0, 23.0] | 5 | 2659.729921 |
| 105 | 1 | (19.0, 23.0] | 6 | 2892.423384 |
| 141 | 1 | (19.0, 23.0] | 7 | 2782.944875 |
| 177 | 1 | (19.0, 23.0] | 8 | 2979.740351 |
| 216 | 1 | (19.0, 23.0] | 9 | NaN |
| 252 | 1 | (19.0, 23.0] | 10 | NaN |
Now merge with previous data
df_filled.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 258 entries, 0 to 257 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 market_id 258 non-null category 1 time_slot_pst 258 non-null object 2 created_week_pst 258 non-null int64 3 mean_duration 258 non-null float64 dtypes: category(1), float64(1), int64(1), object(1) memory usage: 8.5+ KB
# Shift the grid forward one week so each order joins to the PREVIOUS week's
# mean duration (no leakage from the current week).
df_filled['created_week_pst'] +=1 # last week so week 4 in df_filled is matched with 5 in df
# suffixes=[False,False] makes the merge raise if any non-key columns overlap.
df2 = df.merge(df_filled,on=seg_vars + order_vars , suffixes=[False,False],validate = "m:1",how='left')
df_submit2 = df_submit.merge(df_filled,on=seg_vars + order_vars , suffixes=[False,False],validate = "m:1",how='left')
**Feature Engineering 3**
# Add the lagged history feature to the candidate list.
candidates.append('mean_duration')
# Back-fill: the earliest week(s) have no previous-week history, so fill them
# from the following weeks (and the slot/market averages).
df = fill_mean_duration(df2,seg_vars,order_vars,forward_fill = False)
df_submit = fill_mean_duration(df_submit2,seg_vars,order_vars,forward_fill = False)
# Remaining missingness in the test frame:
df_submit.isna().mean()*100
market_id 0.000000 created_at 0.000000 store_id 0.000000 store_primary_category 2.451714 order_protocol 0.516631 total_items 0.000000 subtotal 0.000000 num_distinct_items 0.000000 min_item_price 0.007302 max_item_price 0.000000 total_onshift_dashers 0.000000 total_busy_dashers 0.000000 total_outstanding_orders 0.000000 estimated_order_place_duration 0.000000 estimated_store_to_consumer_driving_duration 0.000000 delivery_id 0.000000 platform 0.000000 created_at_pst 0.000000 created_day_of_week 0.000000 created_month 0.000000 created_hour 0.000000 created_hour_pst 0.000000 time_slot_pst 0.000000 total_free_dasher 8.479682 total_order_per_onshift_dasher 0.000000 total_free_dasher_percent 10.153711 create_at_date 0.000000 store_id_encoded 0.000000 store_primary_category_encoded 0.000000 created_hour_encoded 0.000000 total_free_dasher_band 8.479682 total_free_dasher_percent_band 10.153711 total_free_dasher_band_encoded 0.000000 total_free_dasher_percent_band_encoded 0.000000 created_week_pst 0.000000 created_month_pst 0.000000 mean_duration 0.000000 dtype: float64
df.isna().mean()*100
market_id 0.000000 created_at 0.000000 actual_delivery_time 0.000000 store_id 0.000000 store_primary_category 2.411128 order_protocol 0.504007 total_items 0.000000 subtotal 0.000000 num_distinct_items 0.000000 min_item_price 0.006585 max_item_price 0.000000 total_onshift_dashers 0.000000 total_busy_dashers 0.000000 total_outstanding_orders 0.000000 estimated_order_place_duration 0.000000 estimated_store_to_consumer_driving_duration 0.000000 created_at_pst 0.000000 created_day_of_week 0.000000 created_month 0.000000 created_hour 0.000000 created_hour_pst 0.000000 time_slot_pst 0.000000 duration 0.000000 total_free_dasher 8.257606 total_order_per_onshift_dasher 0.000000 total_free_dasher_percent 10.086720 create_at_date 0.000000 store_id_encoded 0.000000 store_primary_category_encoded 0.000000 created_hour_encoded 0.000000 total_free_dasher_band 8.257606 total_free_dasher_percent_band 10.086720 total_free_dasher_band_encoded 0.000000 total_free_dasher_percent_band_encoded 0.000000 created_week_pst 0.000000 created_month_pst 0.000000 mean_duration 0.000000 dtype: float64
I will use two sets of models: one is linear quantile regression, and the other is gradient boosting.
# Design matrix for the linear model: one-hot encode the remaining
# categoricals, mean-impute leftover numeric gaps, drop identifier/time
# columns, and drop any rows that are still incomplete.
df_with_dummy = pd.concat([df[candidates],
                           pd.get_dummies(df[['market_id','order_protocol','created_day_of_week','created_month','time_slot_pst']],
                                          drop_first = True,dummy_na=True)],axis=1)
df_with_dummy.fillna(df_with_dummy.mean(),inplace=True)
df_with_dummy.drop(['market_id','order_protocol','created_day_of_week'
                    ,'actual_delivery_time','created_at','create_at_date','time_slot_pst'],axis=1,
                   inplace=True,errors='ignore')
df_with_dummy.dropna(inplace=True)
X_dummies = df_with_dummy.drop(['duration'],axis=1)
# Model log-duration: the target is strictly positive and right-skewed.
y_dummies = np.log(df_with_dummy['duration'])
# Ridge regression with the penalty chosen by time-series CV (MSE on log scale).
ridge_reg = RidgeCV(alphas=np.logspace(2.5, 6.5,base=10, num=40),scoring = 'neg_mean_squared_error',
                    cv = TimeSeriesSplit(5)).fit(X_dummies, y_dummies)
ridge_reg.alpha_
10926.00861117378
ridge_reg.best_score_
-0.08729867978280151
# Tabulate the fitted intercept and coefficients by feature.
ridge_reg_res = pd.DataFrame(data={'feature':['intercept'] + X_dummies.columns.values.tolist(),
                                   'coef':[ridge_reg.intercept_]+ridge_reg.coef_.tolist()}).set_index('feature')
ridge_reg_res
| coef | |
|---|---|
| feature | |
| intercept | 5.575644 |
| total_items | 0.001859 |
| subtotal | 0.000027 |
| num_distinct_items | 0.005815 |
| min_item_price | 0.000002 |
| max_item_price | -0.000004 |
| total_onshift_dashers | -0.003141 |
| total_busy_dashers | -0.003582 |
| total_outstanding_orders | 0.004521 |
| estimated_order_place_duration | 0.000154 |
| estimated_store_to_consumer_driving_duration | 0.000425 |
| created_month | 0.004914 |
| total_order_per_onshift_dasher | 0.057567 |
| store_id_encoded | 0.000175 |
| store_primary_category_encoded | 0.000002 |
| created_hour_encoded | 0.000191 |
| total_free_dasher_band_encoded | 0.000141 |
| total_free_dasher_percent_band_encoded | 0.000098 |
| mean_duration | 0.000058 |
| market_id_2 | -0.003960 |
| market_id_3 | 0.007662 |
| market_id_4 | -0.003199 |
| market_id_5 | 0.015056 |
| market_id_6 | -0.023929 |
| market_id_nan | 0.000000 |
| order_protocol_2 | 0.002456 |
| order_protocol_3 | -0.008050 |
| order_protocol_4 | 0.036841 |
| order_protocol_5 | -0.010801 |
| order_protocol_6 | 0.006295 |
| order_protocol_7 | -0.000074 |
| order_protocol_nan | -0.001293 |
| created_day_of_week_2.0 | -0.023630 |
| created_day_of_week_3.0 | -0.039119 |
| created_day_of_week_4.0 | -0.007422 |
| created_day_of_week_5.0 | -0.017323 |
| created_day_of_week_6.0 | 0.013734 |
| created_day_of_week_7.0 | 0.006344 |
| created_day_of_week_nan | 0.000000 |
| created_month_2.0 | 0.004914 |
| created_month_nan | 0.000000 |
I suspect the default parameters are good enough. Nonetheless, I will use Bayesian Optimization to tune the hyperparameters. I will only tune the most important ones.
I first use early stopping to automatically select num_iterations, and focus on the other parameters. This usually speeds up tuning.
# df_train[candidates]
# Feature matrix and target for LightGBM, ordered by creation time so that
# TimeSeriesSplit folds respect chronology.
X = df.sort_values('created_at')[candidates].drop(['duration','actual_delivery_time','created_at','create_at_date'], axis=1).copy()
y = df.sort_values('created_at')['duration'].copy()
lgb_data = lgb.Dataset(X, y, free_raw_data=False)
# Columns LightGBM should treat as categorical (kept integer-coded, not one-hot).
categorical_features = X.columns.values[np.where(X.dtypes == 'category')[0]]
print(categorical_features)
['market_id' 'order_protocol' 'created_day_of_week' 'created_month']
# num_iterations will be chosen by early stopping
def lgb_tuning(learning_rate=0.1, num_leaves=34, min_data_in_leaf=500,num_boost_round=500,early_stopping_rounds=18):
    """CV objective for Bayesian optimization.

    Returns the *negative* of the final asymmetric-MSE CV mean, because
    BayesianOptimization maximizes (lower loss -> higher target). Relies on
    the module-level `lgb_data`, `qmse`, `qmse_metric` and
    `categorical_features`.
    NOTE(review): this uses the pre-4.0 lightgbm API (`fobj`/`feval`/
    `early_stopping_rounds` arguments to lgb.cv) — confirm the pinned version.
    """
    params = {
        'num_leaves': int(num_leaves),            # BO proposes floats; lgb needs ints
        'learning_rate': learning_rate,
        "min_data_in_leaf": int(min_data_in_leaf),
        'verbose': -1,
        'saved_feature_importance_type':1,
        # 'metric': "l2"
    }
    cv_res = lgb.cv(params,lgb_data,num_boost_round=int(num_boost_round),
                    folds = TimeSeriesSplit(5),
                    fobj=qmse,          # custom asymmetric-MSE objective
                    feval=qmse_metric,  # matching evaluation metric
                    early_stopping_rounds=early_stopping_rounds,
                    categorical_feature = categorical_features.tolist(),
                    verbose_eval=False)
    return -cv_res['asym_mse-mean'][-1]
# Stage 1: maximize the (negative) CV loss over the three most influential
# hyperparameters.
lgb_bo = BayesianOptimization(lgb_tuning, { "num_leaves": (20, 100),
                                            'min_data_in_leaf': (10, 1000),
                                            'learning_rate': (0.1, 0.5)},
                              random_state = 520)
%%time
lgb_bo.maximize(n_iter=50, init_points=10)
| iter | target | learni... | min_da... | num_le... | ------------------------------------------------------------- | 1 | -5.697e+0 | 0.2768 | 215.6 | 21.17 | | 2 | -5.713e+0 | 0.2774 | 658.8 | 44.98 | | 3 | -5.699e+0 | 0.1238 | 380.1 | 36.39 | | 4 | -5.675e+0 | 0.1833 | 985.5 | 80.76 | | 5 | -5.708e+0 | 0.2803 | 64.34 | 23.65 | | 6 | -5.725e+0 | 0.1068 | 38.11 | 57.19 | | 7 | -5.71e+05 | 0.3346 | 599.5 | 37.25 | | 8 | -5.764e+0 | 0.2752 | 31.87 | 59.44 | | 9 | -5.695e+0 | 0.2116 | 218.2 | 41.92 | | 10 | -5.68e+05 | 0.2118 | 959.6 | 52.3 | | 11 | -5.735e+0 | 0.3743 | 981.2 | 76.67 | | 12 | -5.679e+0 | 0.2197 | 64.36 | 23.57 | | 13 | -5.765e+0 | 0.3729 | 165.8 | 56.05 | | 14 | -5.926e+0 | 0.4177 | 11.93 | 87.6 | | 15 | -5.71e+05 | 0.2396 | 599.3 | 36.77 | | 16 | -5.669e+0 | 0.2242 | 838.5 | 58.77 | | 17 | -5.741e+0 | 0.2976 | 959.9 | 52.36 | | 18 | -5.753e+0 | 0.3329 | 218.4 | 41.59 | | 19 | -5.796e+0 | 0.4977 | 959.3 | 52.14 | | 20 | -5.697e+0 | 0.2118 | 370.5 | 50.91 | | 21 | -5.667e+0 | 0.1261 | 884.7 | 87.9 | | 22 | -5.672e+0 | 0.1975 | 658.9 | 44.68 | | 23 | -5.668e+0 | 0.1078 | 817.2 | 65.98 | | 24 | -5.716e+0 | 0.2226 | 370.2 | 51.12 | | 25 | -5.735e+0 | 0.4358 | 884.6 | 88.3 | | 26 | -5.747e+0 | 0.4298 | 658.2 | 45.01 | | 27 | -5.735e+0 | 0.3931 | 812.9 | 66.42 | | 28 | -5.789e+0 | 0.4962 | 370.4 | 50.58 | | 29 | -5.745e+0 | 0.3265 | 369.8 | 51.13 | | 30 | -5.682e+0 | 0.2773 | 104.8 | 27.85 | | 31 | -5.736e+0 | 0.2994 | 625.6 | 45.61 | | 32 | -5.724e+0 | 0.1927 | 261.9 | 75.1 | | 33 | -5.748e+0 | 0.3988 | 838.0 | 58.25 | | 34 | -5.715e+0 | 0.2026 | 369.6 | 70.85 | | 35 | -5.718e+0 | 0.2673 | 724.0 | 31.41 | | 36 | -5.704e+0 | 0.3576 | 190.4 | 46.88 | | 37 | -5.683e+0 | 0.1704 | 216.0 | 21.62 | | 38 | -5.78e+05 | 0.4821 | 531.1 | 77.43 | | 39 | -5.682e+0 | 0.114 | 985.5 | 80.97 | | 40 | -5.681e+0 | 0.1145 | 817.3 | 66.3 | | 41 | -5.73e+05 | 0.2715 | 795.6 | 69.06 | | 42 | -5.7e+05 | 0.1823 | 816.8 | 66.34 | | 43 | -5.709e+0 | 0.145 | 525.0 | 97.21 | | 44 | 
-5.698e+0 | 0.367 | 380.5 | 36.77 | | 45 | -5.691e+0 | 0.2221 | 104.9 | 28.32 | | 46 | -5.723e+0 | 0.2646 | 739.2 | 87.8 | | 47 | -5.753e+0 | 0.1056 | 62.97 | 84.52 | | 48 | -5.726e+0 | 0.3756 | 874.6 | 70.91 | | 49 | -5.685e+0 | 0.1536 | 884.9 | 87.94 | | 50 | -5.678e+0 | 0.2028 | 599.0 | 36.54 | | 51 | -5.756e+0 | 0.3287 | 189.7 | 47.26 | | 52 | -5.698e+0 | 0.1691 | 529.5 | 93.96 | | 53 | -5.706e+0 | 0.4102 | 902.6 | 47.74 | | 54 | -5.762e+0 | 0.421 | 985.0 | 80.56 | | 55 | -5.689e+0 | 0.2151 | 884.4 | 87.8 | | 56 | -5.674e+0 | 0.1105 | 327.1 | 25.55 | | 57 | -5.677e+0 | 0.15 | 379.9 | 37.09 | | 58 | -5.704e+0 | 0.2633 | 379.9 | 36.62 | | 59 | -5.749e+0 | 0.4517 | 603.5 | 69.66 | | 60 | -5.684e+0 | 0.1983 | 584.9 | 78.44 | ============================================================= CPU times: user 18min 47s, sys: 6min 34s, total: 25min 21s Wall time: 2min 16s
# Convergence plot: running minimum of the CV loss across BO iterations.
n = len(lgb_bo.res)
df_bo = pd.DataFrame({'target':[lgb_bo.res[i]['target'] for i in range(n)],
                      'learning_rate':[lgb_bo.res[i]['params']['learning_rate'] for i in range(n)],
                      'min_data_in_leaf':[lgb_bo.res[i]['params']['min_data_in_leaf'] for i in range(n)],
                      'num_leaves':[lgb_bo.res[i]['params']['num_leaves'] for i in range(n)]})
df_bo['cum_min'] = [np.min(-df_bo['target'][:i+1]) for i in range(n)]
fig,ax = plt.subplots(figsize = (8,6),dpi=120)
ax.plot(np.arange(n),df_bo['cum_min'],'b.')
ax.plot(np.arange(n),df_bo['cum_min'])
ax.set_xlabel("Number of calls n")
ax.set_ylabel("minimum cost function after n calls")
ax.set_title("Convergence Plot");
# Best parameter set found in stage 1.
lgb_bo.max
{'target': -566691.7530494402,
'params': {'learning_rate': 0.12614126473929443,
'min_data_in_leaf': 884.6807861894162,
'num_leaves': 87.90390221561985}}
lgb_best_param = lgb_bo.max['params']
Given the other parameters, pin down num_iterations (univariate search)
# Stage 2: univariate search over num_boost_round with the other parameters
# fixed; early stopping is disabled so the tested round count is exact.
lgb_bo = BayesianOptimization(partial(lgb_tuning,**lgb_best_param,early_stopping_rounds=None), { "num_boost_round": (10, 100)},
                              random_state = 521)
%%time
lgb_bo.maximize(n_iter=40, init_points=10)
| iter | target | num_bo... | ------------------------------------- | 1 | -5.707e+0 | 73.56 | | 2 | -5.695e+0 | 61.31 | | 3 | -5.726e+0 | 92.35 | | 4 | -5.726e+0 | 92.21 | | 5 | -5.676e+0 | 49.77 | | 6 | -5.679e+0 | 45.97 | | 7 | -5.709e+0 | 76.2 | | 8 | -6.169e+0 | 20.33 | | 9 | -5.676e+0 | 47.98 | | 10 | -5.707e+0 | 72.97 | | 11 | -5.734e+0 | 100.0 | | 12 | -5.685e+0 | 54.57 | | 13 | -5.678e+0 | 48.65 | | 14 | -5.717e+0 | 83.7 | | 15 | -5.667e+0 | 37.17 | | 16 | -5.685e+0 | 33.27 | | 17 | -5.7e+05 | 66.23 | | 18 | -5.675e+0 | 40.53 | | 19 | -5.674e+0 | 35.36 | | 20 | -5.682e+0 | 52.2 | | 21 | -5.691e+0 | 57.95 | | 22 | -5.669e+0 | 38.7 | | 23 | -5.667e+0 | 37.77 | | 24 | -5.667e+0 | 37.47 | | 25 | -5.667e+0 | 37.47 | | 26 | -5.669e+0 | 36.83 | | 27 | -1.081e+0 | 10.0 | | 28 | -5.82e+05 | 25.74 | | 29 | -5.723e+0 | 87.3 | | 30 | -5.73e+05 | 97.0 | | 31 | -5.714e+0 | 80.09 | | 32 | -5.706e+0 | 69.09 | | 33 | -5.675e+0 | 43.16 | | 34 | -5.7e+05 | 63.71 | | 35 | -5.706e+0 | 30.36 | | 36 | -5.673e+0 | 41.95 | | 37 | -5.687e+0 | 56.23 | | 38 | -5.669e+0 | 38.02 | | 39 | -5.667e+0 | 37.4 | | 40 | -5.675e+0 | 50.91 | | 41 | -5.667e+0 | 37.42 | | 42 | -5.667e+0 | 37.41 | | 43 | -5.667e+0 | 37.4 | | 44 | -5.673e+0 | 39.37 | | 45 | -5.667e+0 | 37.55 | | 46 | -5.681e+0 | 44.45 | | 47 | -5.667e+0 | 37.37 | | 48 | -5.667e+0 | 37.58 | | 49 | -5.667e+0 | 37.31 | | 50 | -5.667e+0 | 37.54 | ===================================== CPU times: user 22min 43s, sys: 6min 13s, total: 28min 56s Wall time: 2min 35s
# Convergence plot for the stage-2 search.
n = len(lgb_bo.res)
df_bo = pd.DataFrame({'target':[lgb_bo.res[i]['target'] for i in range(n)],
                      'num_boost_round':[lgb_bo.res[i]['params']['num_boost_round'] for i in range(n)]})
df_bo['cum_min'] = [np.min(-df_bo['target'][:i+1]) for i in range(n)]
fig,ax = plt.subplots(figsize = (8,6),dpi=120)
ax.plot(np.arange(n),df_bo['cum_min'],'b.')
ax.plot(np.arange(n),df_bo['cum_min'])
ax.set_xlabel("Number of calls n")
ax.set_ylabel("minimum cost function after n calls")
ax.set_title("Convergence Plot");
# Best num_boost_round found.
lgb_bo.max
{'target': -566691.7530494402,
'params': {'num_boost_round': 37.17330094004068}}
# Freeze the chosen boosting-round count into the best-parameter dict.
lgb_best_param.update(num_boost_round=int(lgb_bo.max['params']['num_boost_round']))
lgb_best_param
{'learning_rate': 0.12614126473929443,
'min_data_in_leaf': 884.6807861894162,
'num_leaves': 87.90390221561985,
'num_boost_round': 37}
# Out-of-fold predictions for the Ridge benchmark, fold by fold in time order.
df_cv = df_with_dummy.copy().reset_index()
df_cv['predicted_duration'] = np.nan
for train_fold,cv_fold in TimeSeriesSplit(5).split(df_cv):
    df_train_fold = df_cv.iloc[train_fold,:]
    df_val_fold = df_cv.iloc[cv_fold,:]
    # fit on log-duration; exponentiate predictions back to seconds
    linear_qreg = Ridge(alpha=ridge_reg.alpha_).fit(df_train_fold.drop(['duration','predicted_duration'],axis=1),
                                                    np.log(df_train_fold['duration']))
    # .loc with the positional fold arrays is valid here only because
    # reset_index() above produced a RangeIndex
    df_cv.loc[cv_fold,'predicted_duration'] = np.exp(linear_qreg.predict(df_val_fold.drop(['duration','predicted_duration'],axis=1)))
def lgb_cv_oos_prediction(params, df, candidates, categorical_features, fobj=qmse, feval=qmse_metric):
    """
    Make out-of-sample predictions for a LightGBM model using a 5-fold
    time-series split (each fold trains only on chronologically earlier rows).

    Inputs:
        params: dict of parameters passed to lgb.train
        df: data frame (should be sorted by created_at so the split
            respects chronology)
        candidates: list of relevant variables (label + features)
        categorical_features: array-like of str (names of categorical features)
        fobj: callable; custom objective function
        feval: callable; custom evaluation metric
    Outputs:
        df_lb: a dataframe with a 'predicted_duration' column holding the
               out-of-sample predictions (NaN for rows that were never in a
               validation fold, i.e. the initial training slice)
    """
    df_lb = df[candidates].copy().reset_index()
    # Columns that must never be used as features: the label, the raw
    # timestamps, the row-id column added by reset_index(), and the
    # prediction column itself — the latter appears after fold 1 and would
    # otherwise leak into later folds' training matrices (the original code
    # dropped neither 'index' nor 'predicted_duration').
    drop_cols = ['duration', 'actual_delivery_time', 'created_at',
                 'create_at_date', 'index', 'predicted_duration']
    for train_fold, cv_fold in TimeSeriesSplit(5).split(df_lb):
        fold_X_train = df_lb.drop(columns=drop_cols, errors='ignore').iloc[train_fold, :]
        fold_y_train = df_lb.loc[train_fold, 'duration']
        fold_X_val = df_lb.drop(columns=drop_cols, errors='ignore').iloc[cv_fold, :]
        fold_y_val = df_lb.loc[cv_fold, 'duration']
        lgb_train = lgb.Dataset(fold_X_train, fold_y_train)
        lgb_cv = lgb.Dataset(fold_X_val, fold_y_val)
        lbm_model = lgb.train(params, lgb_train,
                              fobj=fobj,
                              feval=feval,
                              valid_sets=lgb_cv,
                              categorical_feature=categorical_features.tolist(),
                              verbose_eval=False)
        df_lb.loc[cv_fold, 'predicted_duration'] = lbm_model.predict(fold_X_val)
    return df_lb
# Final ("gold") LightGBM parameter set: tuned values cast to the integer
# types LightGBM expects, plus fixed bookkeeping options.
params_gold = {}
params_gold['num_leaves'] = int(lgb_best_param['num_leaves'])
params_gold['learning_rate'] = lgb_best_param['learning_rate']
params_gold['min_data_in_leaf'] = int(lgb_best_param['min_data_in_leaf'])
params_gold['num_boost_round'] = int(lgb_best_param['num_boost_round'])
params_gold['early_stopping_rounds'] = None
params_gold['verbose'] = -1
params_gold['saved_feature_importance_type'] = 1
# Out-of-sample predictions with the tuned parameters, on data sorted
# chronologically so the time-series split is valid.
df_gold = lgb_cv_oos_prediction(
    params_gold,
    df.sort_values('created_at'),
    candidates,
    categorical_features,
    fobj=qmse,
    feval=qmse_metric,
)
# Attach both models' out-of-sample predictions to the raw frame. Each
# helper frame carries the original row label in 'index', so the 1:1 merges
# line predictions up with the right orders.
ridge_pred = df_cv[['index', 'predicted_duration']].rename(
    columns={'predicted_duration': 'pred_duration_Ridge'})
gold_pred = df_gold[['index', 'predicted_duration']].rename(
    columns={'predicted_duration': 'pred_duration_lgb_gold'})
df_eval = df.merge(ridge_pred, left_index=True, right_on='index',
                   how='inner', validate="1:1", suffixes=[False, False])
df_eval = df_eval.merge(gold_pred, how='inner', validate="1:1",
                        suffixes=[False, False])
df_eval = df_eval.sort_values('created_at')
# Per-model residual diagnostics: raw, absolute, squared, and the
# asymmetric square that doubles the penalty on under-prediction.
for name in ('Ridge', 'lgb_gold'):
    resid = df_eval['duration'] - df_eval['pred_duration_' + name]
    df_eval[name + '_resid'] = resid
    df_eval[name + '_resid_abs'] = resid.abs()
    df_eval[name + '_resid2'] = resid ** 2
    df_eval[name + '_resid_asym'] = np.where(resid < 0, resid ** 2, 2 * resid ** 2)
I will look at the symmetric RMSE and the asymmetric RMSE, where under-prediction is penalized twice as much as over-prediction.
The optimized LGBM model performs best on the asymmetric metric.
# Aggregate each residual family into a one-column summary table, mapping
# the raw residual-column names back to readable model labels.
def _metric_table(regex, suffix, col_label, to_minutes):
    # Mean the matching residual columns, convert to minutes, and relabel.
    summary = to_minutes(df_eval.filter(regex=regex).mean()).sort_index()
    tbl = pd.DataFrame(summary, columns=[col_label]).reset_index()
    tbl.replace({'index': {'lgb_gold_' + suffix: 'LGB(Optimized)',
                           'Ridge_' + suffix: 'Ridge'}}, inplace=True)
    return tbl

rmse_res = _metric_table("resid2", 'resid2', 'Sym. RMSE (in minutes)',
                         lambda m: m ** 0.5 / 60)
armse_res = _metric_table("resid_asym", 'resid_asym', 'Asym. RMSE (in minutes)',
                          lambda m: m ** 0.5 / 60)
mae_res = _metric_table("resid_abs", 'resid_abs', 'MAE (in minutes)',
                        lambda m: m / 60)
res = armse_res.merge(rmse_res).merge(mae_res)
res.rename(columns={'index': 'model'}).set_index('model').sort_values('Asym. RMSE (in minutes)')
| Asym. RMSE (in minutes) | Sym. RMSE (in minutes) | MAE (in minutes) | |
|---|---|---|---|
| model | |||
| LGB(Optimized) | 22.041877 | 17.209019 | 11.906332 |
| Ridge | 23.069622 | 17.166848 | 11.053351 |
The Ridge regression generally under-predicts, so its residual distribution sits to the right of the LGBM model's.
# Residual distributions for the two models: full view plus two zooms
# (around zero and into the right tail of large under-predictions).
fig, ax = plt.subplots(3, 1, figsize=(6, 4 * 3), dpi=120)
df_plot = df_eval[['Ridge_resid', 'lgb_gold_resid']].dropna().melt(
    value_vars=['Ridge_resid', 'lgb_gold_resid'],
    var_name='Model', value_name='Residual')
panels = [
    {'xlim': (-1e4, 2e4), 'ylim': None, 'title': None},
    {'xlim': (-4000, 4000), 'ylim': None,
     'title': "Distribution of Residual: Zoomed in"},
    {'xlim': (4000, 10000), 'ylim': (0, 0.5e-5),
     'title': "Distribution of Residual: Zoomed in the right tail"},
]
for axis, spec in zip(ax, panels):
    sns.histplot(df_plot, x="Residual", hue="Model", element="step",
                 stat="density", common_norm=False, ax=axis)
    axis.set_xlim(*spec['xlim'])
    if spec['ylim'] is not None:
        axis.set_ylim(*spec['ylim'])
    if spec['title'] is not None:
        axis.set_title(spec['title'])
plt.tight_layout()
Lastly, I will look at the total delivery time per day.
# Aggregate actual vs. predicted delivery seconds to daily totals and
# overlay the three series.
daily_cols = ['create_at_date', 'duration', 'pred_duration_Ridge', 'pred_duration_lgb_gold']
ts = df_eval[daily_cols].dropna().groupby('create_at_date').sum()
fig, ax = plt.subplots(figsize=(10, 6), dpi=120)
ax.plot(ts['duration'], label='Actual Duration')
ax.plot(ts['pred_duration_Ridge'], label='Pred Duration:Ridge',
        ls='dotted', color='red', alpha=0.7)
ax.plot(ts['pred_duration_lgb_gold'], label='Pred Duration:Tuned LGBM',
        ls='--', color='green', alpha=0.8)
ax.legend()
ax.set_title('Total Delivery Duration Per Day');
params_gold
{'num_leaves': 87,
'learning_rate': 0.12614126473929443,
'min_data_in_leaf': 884,
'num_boost_round': 37,
'early_stopping_rounds': None,
'verbose': -1,
'saved_feature_importance_type': 1}
# Train the final model on the full training data (lgb_data) with the tuned
# "gold" parameters, the asymmetric qmse objective, and its matching
# evaluation metric; progress is printed every 10 boosting rounds.
lgb_final = lgb.train(params_gold, lgb_data,
                      fobj=qmse,
                      feval=qmse_metric,
                      categorical_feature=categorical_features.tolist(),
                      verbose_eval=10)
# In-sample fit diagnostics: compute the residual once and derive the
# absolute, squared, and asymmetric-squared (under-prediction doubled)
# variants from it, then report the two RMSE flavors in minutes.
df['pred_duration'] = lgb_final.predict(X)
resid = df['duration'] - df['pred_duration']
df['resid'] = resid
df['resid_abs'] = resid.abs()
df['resid2'] = resid ** 2
df['resid_asym'] = np.where(resid < 0, df['resid2'], 2 * df['resid2'])
df[['resid_asym', 'resid2']].mean() ** 0.5 / 60
resid_asym 19.989790 resid2 16.001925 dtype: float64
df[['resid_abs']].mean()/60
resid_abs 11.179691 dtype: float64
# In-sample residual distribution, clipped to +/-10,000 seconds.
fig, ax = plt.subplots(figsize=(6, 4), dpi=120)
clean_resid = df['resid'].dropna()
sns.histplot(clean_resid, element="step", stat="density", common_norm=False, ax=ax)
ax.set_xlim(-1e4, 1e4);
plt.tight_layout()
# Daily totals of actual vs. fitted duration over the whole training sample.
ts = (df[['create_at_date', 'duration', 'pred_duration']]
      .dropna()
      .groupby('create_at_date')
      .sum())
fig, ax = plt.subplots(figsize=(10, 6), dpi=120)
for col, label, style in [('duration', 'Actual Duration', {}),
                          ('pred_duration', 'Pred Duration:Tuned LGBM',
                           {'ls': '--', 'color': 'red', 'alpha': 0.8})]:
    ax.plot(ts[col], label=label, **style)
ax.legend()
ax.set_title('Total Duration Per Day (all training sample)');
# Rank features by LightGBM's importance (saved_feature_importance_type=1
# was set in params_gold) and draw them as a bar chart.
df_feature_importance = pd.DataFrame(
    {'importance': lgb_final.feature_importance()},
    index=pd.Index(lgb_final.feature_name(), name='feature'),
).sort_values('importance', ascending=False)
fig, ax = plt.subplots(figsize=(6, 6), dpi=120)
df_feature_importance.plot.bar(ax=ax)
ax.set_title("Feature importances from LightGBM")
ax.set_ylabel("Total gains of splits using each feature")
fig.tight_layout()
# SHAP values for every training row: decompose each prediction into
# additive per-feature contributions, then show the global summary plot.
explainer = shap.TreeExplainer(lgb_final)
shap_values = explainer.shap_values(X)
# summarize the effects of all the features
shap.summary_plot(shap_values, X)
We can also see the partial dependence (directional) here:
# Restrict to the bulk of the distribution so extreme outliers do not
# compress the dependence-plot axes; one upper bound per filtered feature.
_upper_bounds = {
    "total_order_per_onshift_dasher": 7.5,
    "total_items": 50,
    "subtotal": 15000,
    "min_item_price": 7000,
    "estimated_order_place_duration": 1500,
    "store_id_encoded": 10000,
}
mask = pd.Series(True, index=X.index)
for col, bound in _upper_bounds.items():
    mask &= X[col] < bound
# One dependence plot per feature, colored by the most-interacting feature.
for name in X.columns.values:
    shap.dependence_plot(name, shap_values[mask], X[mask],
                         display_features=X[mask])
# Build the submission feature matrix with exactly the columns (and column
# order) the model was trained on. The original indexed the frame with a
# Python set, which modern pandas rejects ("passing a set as an indexer is
# not supported"); a list comprehension keeps a deterministic order.
submit_cols = [c for c in candidates if c not in ('actual_delivery_time', 'duration')]
X_submit = (df_submit.sort_values('created_at')[submit_cols]
            .drop(['created_at', 'create_at_date'], axis=1)
            .copy()[X.columns])  # [X.columns] aligns order with training X
X_submit.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 54778 entries, 0 to 54777 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 market_id 54778 non-null category 1 order_protocol 54495 non-null category 2 total_items 54778 non-null int64 3 subtotal 54778 non-null int64 4 num_distinct_items 54778 non-null int64 5 min_item_price 54774 non-null float64 6 max_item_price 54778 non-null int64 7 total_onshift_dashers 54778 non-null float64 8 total_busy_dashers 54778 non-null float64 9 total_outstanding_orders 54778 non-null float64 10 estimated_order_place_duration 54778 non-null int64 11 estimated_store_to_consumer_driving_duration 54778 non-null float64 12 created_day_of_week 54778 non-null category 13 created_month 54778 non-null category 14 total_order_per_onshift_dasher 54778 non-null float64 15 store_id_encoded 54778 non-null float64 16 store_primary_category_encoded 54778 non-null float64 17 created_hour_encoded 54778 non-null float64 18 total_free_dasher_band_encoded 54778 non-null float64 19 total_free_dasher_percent_band_encoded 54778 non-null float64 20 mean_duration 54778 non-null float64 dtypes: category(4), float64(12), int64(5) memory usage: 7.7 MB
# Score the submission set with the final model, write the predictions to
# CSV, and persist the trained booster to disk for later reuse.
df_submit['pred_duration'] = lgb_final.predict(X_submit)
df_submit.to_csv('submit.csv',index=False)
lgb_final.save_model('lgb_final.txt');
The following features are snapshots taken at `created_at` (order submission time): `total_onshift_dashers`, `total_busy_dashers` (dashers who are currently working on an order), and `total_outstanding_orders`.
We also have predictions from other models for various stages of the delivery process that we can use.
# # some dropped due to nan or perfect collinearity
# x_col =[
# 'subtotal',
# 'num_distinct_items',
# 'max_item_price',
# 'total_onshift_dashers',
# 'total_busy_dashers',
# 'total_outstanding_orders',
# 'estimated_order_place_duration',
# 'estimated_store_to_consumer_driving_duration',
# 'total_order_per_onshift_dasher',
# 'market_id_1',
# 'market_id_2',
# 'market_id_3',
# 'market_id_4',
# 'market_id_5',
# 'order_protocol_1',
# 'order_protocol_2',
# 'order_protocol_3',
# 'order_protocol_4',
# 'order_protocol_5',
# 'order_protocol_6',
# 'created_day_of_week_1',
# 'created_day_of_week_2',
# 'created_day_of_week_3',
# 'created_day_of_week_4',
# 'created_day_of_week_5',
# 'created_day_of_week_6',
# 'created_hour_0',
# 'created_hour_1',
# 'created_hour_2',
# 'created_hour_3',
# 'created_hour_4',
# 'created_hour_5',
# 'created_hour_6',
# 'created_hour_7',
# 'created_hour_14',
# 'created_hour_15',
# 'created_hour_16',
# 'created_hour_17',
# 'created_hour_18',
# 'created_hour_19',
# 'created_hour_20',
# 'created_hour_21',
# 'created_hour_22']
# formula = 'np.log(duration) ~ '+"+".join(x_col)
# model_linear = smf.ols(formula, df_with_dummy.dropna())
# res = model_linear.fit(cov_type='cluster', cov_kwds={'groups': df_with_dummy.dropna()['market_id']})
# res.summary()
# X = add_constant(df_with_dummy[x_col]).dropna()
# pd.Series([variance_inflation_factor(X.values, i)
# for i in range(X.shape[1])],
# index=X.columns)